爬虫模型作业代码笔记

mac2026-06-10  2

import requests
from lxml import etree


class Content:
    """Container shared by every scraped article or page.

    Holds the page URL together with the title and body text that were
    extracted from it.
    """

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body of this page to standard output."""
        # Each entry pairs an output template with the value it displays;
        # the entries are printed in this order, one print call per entry.
        report = (
            ('URL: {}', self.url),
            ('TITLE: {}', self.title),
            ('BODY:\n{}', self.body),
        )
        for template, value in report:
            print(template.format(value))

这里用的是 XPath 定位。

class Website:
    """Describes where a crawler should look on one particular site.

    Attributes:
        name: label for the site.
        url: address of the site's landing page.
        x_path: XPath expression stored for locating links.
        titleTag: XPath expression stored for locating the article title.
        bodyTag: XPath expression stored for locating the article body.
    """

    def __init__(self, name, url, x_path, titleTag, bodyTag):
        # Values are stored exactly as given; nothing is validated here.
        self.name, self.url = name, url
        self.x_path = x_path
        self.titleTag, self.bodyTag = titleTag, bodyTag

selectedElems = pageObj.xpath('string({})'.format(x_path)) 这里用 string() 是因为内容页的正文文字分散在同一个父标签的多个子标签中,string() 可以把这些子标签里的文字一次性拼接成一个字符串。注意:传给 string() 的 XPath 路径本身不需要再加引号。

class Crawler:
    """Crawl one site: collect article links from its landing page and print each article.

    Attributes:
        site: object describing the target site; this class reads its
            ``url``, ``x_path``, ``titleTag`` and ``bodyTag`` attributes.
    """

    def __init__(self, site):
        self.site = site

    def getPage(self, url):
        """Download ``url`` and parse the response body with ``etree.HTML``.

        Args:
            url: address of the page to fetch.

        Returns:
            The value returned by ``etree.HTML`` for the response body, or
            ``None`` when the request itself fails (connection error,
            timeout, unsupported URL scheme, ...).
        """
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
        )
        headers = {"User-Agent": user_agent}  # request headers are passed as a dict
        try:
            # Without a timeout a single unresponsive server would hang the
            # whole crawl indefinitely.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            # Callers treat None as "page unavailable" and skip it.
            return None
        return etree.HTML(response.content)

    def get_url(self, selector, x_path):
        """Return absolute URLs for the elements matched by ``x_path``.

        Args:
            selector: parsed page exposing an ``xpath`` method.
            x_path: XPath expression selecting the link elements.

        Returns:
            A list with one URL per matched element that carries an ``href``
            attribute, each resolved against ``self.site.url``. Elements
            without ``href`` are skipped.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        urls = []
        for element in selector.xpath(x_path):
            href = element.attrib.get('href')
            if href is None:
                continue
            # urljoin handles root-relative ("/a/b.htm") and already-absolute
            # hrefs correctly, where plain concatenation produced
            # "http://host//a/b.htm" or "http://host/http://other/...".
            urls.append(urljoin(self.site.url, href))
        return urls

    def safeGet(self, pageObj, x_path):
        """Return the string value of the node selected by ``x_path``.

        The expression is wrapped in the XPath ``string()`` function so that
        text spread over several child elements comes back as one string.

        Args:
            pageObj: parsed page exposing an ``xpath`` method.
            x_path: XPath expression locating the wanted node.

        Returns:
            The result of evaluating ``string(<x_path>)``, or an empty
            string if the evaluation raises.
        """
        try:
            return pageObj.xpath('string({})'.format(x_path))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate.
            return ''

    def parse(self, url):
        """Fetch ``url`` and print it as a ``Content`` if title and body are both non-empty."""
        selector = self.getPage(url)
        if selector is None:
            return
        title = self.safeGet(selector, self.site.titleTag)
        body = self.safeGet(selector, self.site.bodyTag)
        if title != '' and body != '':
            content = Content(url, title, body)
            content.print()

    def crawl(self):
        """Fetch the site's landing page and parse every link found on it."""
        selector = self.getPage(self.site.url)
        if selector is None:
            # Landing page could not be fetched or parsed; nothing to follow.
            return
        for url in self.get_url(selector, self.site.x_path):
            self.parse(url)


# One row per site: name, landing-page URL, XPath for the article links,
# XPath for the article title, XPath for the article body.
siteData = [
    ['kjxy', 'http://kjxy.hbue.edu.cn/', '//*[(@id = "wp_news_w6")]//a',
     '/html/body/div[2]/div[1]/div[3]/div[2]/div/div[1]/div/div/h1',
     '/html/body/div[2]/div[1]/div[3]/div[2]/div/div[1]/div/div/div[5]/div/div'],
    ['jrxy', 'http://jrxy.hbue.edu.cn/', '//*[(@id = "wp_news_w3")]//a',
     '//h1', '//*[@id="entry"]/div/div'],
    ['yjs', 'http://yjs.hbue.edu.cn/', '//*[(@id = "wp_news_w3")]//a',
     '//h1', '//*[@id="mainbody3"]/div[2]/div/div[3]/div/div[1]'],
    ['mpacc', 'http://mpacc.hbue.edu.cn/', '//*[(@id = "wp_news_w4")]//a',
     '/html/body/div[1]/div[5]/div[2]/div/div/div[1]',
     '//*[@id="entry"]/div/div'],
]

reuter1 = Website(*siteData[0])
reuter2 = Website(*siteData[1])
reuter3 = Website(*siteData[2])
reuter4 = Website(*siteData[3])

# Only the first site is crawled; the other three are constructed but unused.
crawler = Crawler(reuter1)
crawler.crawl()
最新回复(0)