Introduction to XPath in Python and a Detailed Syntax Guide
1. Introduction
XPath is a language for finding information in XML and HTML documents. It can be used to traverse and select elements and attributes in XML and HTML documents.
Installing an XPath tool
Chrome extension: XPath Helper
Click the menu at the top right of Chrome: More Tools -> Extensions -> open the Chrome Web Store -> add XPath Helper (accessing the store requires a proxy).
2. Syntax in Detail
Selecting nodes:
/    If it appears at the start of an expression, it selects from the root node; otherwise it selects a node directly under the current node (children only, not deeper descendants). /html returns one result; /div returns zero results, because the only child of the root is html; /html/body returns one result.
//   Selects all descendant nodes, e.g. //head/script, //div.
.    Selects the current node.
..   Selects the parent of the current node.
@    Selects attributes: //div[@id] selects all div elements that have an id attribute.
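To make these rules concrete, here is a minimal sketch that runs each selector with lxml against a small in-memory HTML document. The sample HTML and variable names are made up purely for illustration; they are not part of the original article.

# -*- coding: utf-8 -*-
from lxml import etree

# A tiny HTML document invented to demonstrate the selectors above.
sample = """
<html>
  <head><script>var x = 1;</script></head>
  <body>
    <div id="main"><a href="/jobs/1">Engineer</a></div>
    <div>no id here</div>
  </body>
</html>
"""

html = etree.HTML(sample)

print(html.xpath('/html/body'))      # '/' from the root: one <body> element
print(html.xpath('//head/script'))   # '//' searches all descendants
print(html.xpath('//div[@id]'))      # '@' filters on attributes: only the div with an id
div = html.xpath('//div[@id="main"]')[0]
print(div.xpath('./a/text()'))       # '.' is relative to the current node -> ['Engineer']
print(div.xpath('..')[0].tag)        # '..' is the parent node -> 'body'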
3. To use XPath from Python, you need the lxml library.
lxml is written in C. Installing it with a plain pip3 install lxml may leave the IDE flagging the import, but this does not affect usage.
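For orientation before the examples below: lxml offers two common ways to build a tree for XPath queries, and both appear later in this article. etree.parse() reads an HTML file from disk, optionally with an explicit HTMLParser to control the encoding, while etree.HTML() parses a string you already hold, such as response.text from requests. A minimal sketch, assuming a local file named tencent.html exists in the working directory:

from lxml import etree

# Parse an HTML file from disk; pass an HTMLParser to control the encoding.
parser = etree.HTMLParser(encoding='utf-8')
tree = etree.parse('tencent.html', parser=parser)  # assumes tencent.html exists locally

# Parse HTML that is already held in a string (e.g. response.text from requests).
root = etree.HTML('<html><body><p>hello</p></body></html>')

print(tree.xpath('//title/text()'), root.xpath('//p/text()'))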
Still, a programmer's instinct for clean code makes those squiggly underlines hard to ignore, so it is worth adjusting the IDE settings until the warning goes away.
4. Using lxml with XPath

# -*- coding: utf-8 -*-
from lxml import etree

# 1. Get all tr tags
# 2. Get the second tr tag
# 3. Get all tags whose class equals "even"
# 4. Get the href attribute of all a tags
# 5. Get all the job postings (plain text)

parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('tencent.html', parser=parser)
base_url = ''  # the site's base URL was stripped from the original post; fill it in before use

# 1. Get all tr tags
# The xpath() function returns a list
# trs = html.xpath('//tr')
# print(trs)
# for tr in trs:
#     print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 2. Get the second tr tag
# trs = html.xpath('//tr[2]')[0]
# Looking up the second tr directly like this actually matches the second tr of every table.
# To be more precise, find the table first, then the second tr inside that table.
# trs = html.xpath('//table[@class="tablelist"]//tr[2]')[0]
# print(etree.tostring(trs, encoding='utf-8').decode('utf-8'))

# 3. Get all tr tags whose class equals "even"
# trs = html.xpath("//tr[@class='even']")
# for tr in trs:
#     print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 4. Get the href attribute of all a tags
# a_list = html.xpath('//a/@href')
# for a in a_list:
#     print(a)

# 5. Get all the job postings (plain text)
trs = html.xpath('//tr[position()>1 and position()<11]')
positions = []
for tr in trs:
    # With a leading //, xpath always searches the whole document and ignores the current tr
    # href = tr.xpath('//a')
    # With a leading ., xpath searches only under the current tag
    href = tr.xpath('.//a/@href')[0]
    fullurl = base_url + href
    # The title text is not a direct child of td[1], so use ./td[1]//text()
    title = tr.xpath('./td[1]//text()')[0]
    category = tr.xpath('./td[2]/text()')[0]
    nums = tr.xpath('./td[3]/text()')[0]
    address = tr.xpath('./td[4]/text()')[0]
    pubtime = tr.xpath('./td[5]/text()')[0]
    position = {
        'url': fullurl,
        'title': title,
        'category': category,
        'nums': nums,
        'address': address,
        'pubtime': pubtime
    }
    positions.append(position)
# print(positions)

# 6. Plain text can also be extracted with string()
# print(html.xpath("string(//tr[1])"))
# trs = html.xpath('//tr')
# for tr in trs:
#     print(tr.xpath("string(.)").strip())
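Point 6 above deserves a closer look: .//text() returns every text node as a separate list item, while string(.) concatenates all the text under a node into a single string. A small sketch with a made-up table row, added here only to illustrate the difference:

from lxml import etree

row = etree.HTML('<table><tr><td><a href="#">Engineer</a></td><td> Shenzhen </td></tr></table>')

print(row.xpath('//tr//text()'))          # a list of text nodes: ['Engineer', ' Shenzhen ']
print(row.xpath('string(//tr)').strip())  # one concatenated string: 'Engineer Shenzhen'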
5. Hands-on example: a Douban Movies scraper
# -*- coding: utf-8 -*-
# 1. Fetch the page from the target site
# 2. Extract data from the fetched page according to fixed rules
import requests
from lxml import etree

# 1. Fetch the page from the target site
# Note: the request URLs were stripped from the original post; the empty strings below are
# placeholders for the Douban "now playing" page and its Referer.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Referer': ''
}
url = ''
response = requests.get(url, headers=headers)
text = response.text

# 2. Extract data according to fixed rules
html = etree.HTML(text)
# The selector for the "now playing" list was also stripped from the original post;
# //ul[@class='lists'] is a guess based on the page structure the tutorial targeted.
ul = html.xpath("//ul[@class='lists']")[0]
print(etree.tostring(ul, encoding='utf-8').decode('utf-8'))
lis = ul.xpath('./li[@data-title]')
movies = []
for li in lis:
    title = li.xpath('@data-title')[0]
    score = li.xpath('@data-rate')[0]
    duration = li.xpath('@data-duration')[0]
    region = li.xpath('@data-region')[0]
    director = li.xpath('@data-director')[0]
    actors = li.xpath('@data-actors')[0]
    thumbnail = li.xpath('.//img/@src')[0]
    movie = {
        'title': title,
        'score': score,
        'duration': duration,
        'region': region,
        'director': director,
        'actors': actors,
        'thumbnail': thumbnail
    }
    movies.append(movie)
print(movies)
6. Hands-on example: a Movie Paradise (dytt) scraper
# -*- coding: utf-8 -*-
import requests
from lxml import etree

# Note: every URL in this script was stripped from the original post; the empty strings are
# placeholders for the Movie Paradise (dytt) listing pages and base domain.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
# url = ''
# response = requests.get(url, headers=headers)
# On Movie Paradise pages the requests library guesses the wrong encoding,
# so response.text comes out garbled.
# print(response.text)
# text = response.content.decode('gbk')

BaseDomain = ''  # the site's base domain (stripped from the original post)


def get_detail_url(url):
    response = requests.get(url, headers=headers)
    # print(response.encoding)  # the default decoding is ISO-8859-1
    # text = response.content.decode('gbk')
    # Decoding with gbk ran into problems: page 5 contains special characters that cannot be decoded.
    # It is probably because lxml's default decoding does not match gbk; in that case response.text
    # can be passed in directly. Since the strings extracted here are plain ASCII (the detail-page
    # links), they come out correctly even without specifying the decoding.
    html = etree.HTML(response.text)
    detail_urls = html.xpath('//table[@class="tbspan"]//a/@href')
    detail_urls = list(map(lambda url: BaseDomain + url, detail_urls))
    return detail_urls


def parse_detail_page(url):
    response = requests.get(url, headers=headers)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//font[@color='#07519a' and position()=1]/text()")[0]
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    screenshot = imgs[1]
    infos = zoomE.xpath(".//text()")
    movie = {
        'title': title,
        'cover': cover,
        'screenshot': screenshot
    }

    def parse_info(info, rule):
        return info.replace(rule, '').strip()

    for index, info in enumerate(infos):
        if info.startswith('◎年 代'):
            movie['year'] = parse_info(info, '◎年 代')
        elif info.startswith('◎产 地'):
            movie['country'] = parse_info(info, '◎产 地')
        elif info.startswith('◎类 别'):
            movie['category'] = parse_info(info, '◎类 别')
        elif info.startswith('◎豆瓣评分'):
            movie['douban_rating'] = parse_info(info, '◎豆瓣评分')
        elif info.startswith('◎片 长'):
            movie['duration'] = parse_info(info, '◎片 长')
        elif info.startswith('◎导 演'):
            movie['director'] = parse_info(info, '◎导 演')
        elif info.startswith('◎主 演'):
            info = parse_info(info, '◎主 演')
            actors = []
            actors.append(info)
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎简 介'):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith('◎简 介 '):
            info = ''
            for x in range(index + 1, len(infos)):
                if infos[x].startswith('【下载地址】'):
                    break
                info = info + infos[x].strip()
            movie['profile'] = info
    download_url = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")[0]
    movie['download_url'] = download_url
    return movie


def spider():
    # The page URLs could also be built with a list comprehension; the URL pattern was stripped
    # from the original post, so base_url below is an empty placeholder to fill in.
    base_url = ''
    movies = []
    for x in range(1, 8):
        url = base_url.format(x)
        detail_urls = get_detail_url(url)
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            movies.append(movie)
        print(movies)


if __name__ == '__main__':
    spider()
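The comments above mention that page 5 contains characters that break a strict gbk decode. If you want a fully decoded page rather than falling back to response.text, one option is to decode with an error handler. A minimal sketch; the function name is illustrative and not part of the original script:

import requests

def fetch_gbk_page(url, headers=None):
    """Fetch a page that declares gbk/gb2312 and decode it tolerantly."""
    response = requests.get(url, headers=headers)
    # 'replace' substitutes undecodable bytes with U+FFFD instead of raising UnicodeDecodeError;
    # 'ignore' would silently drop them. Decoding as gb18030, a superset of gbk, often works too.
    return response.content.decode('gbk', errors='replace')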
7. Hands-on example: a Tencent Recruitment scraper
# -*- coding: utf-8 -*-
import requests
from lxml import etree

# Note: the listing URL pattern and base domain were stripped from the original post;
# the empty strings below are placeholders for the Tencent Recruitment site.
base_url = ''     # listing page URL pattern with a {} placeholder for the page index
base_domain = ''  # base domain used to build detail-page URLs
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
positions = []


def parse_url(url):
    detail_urls = []
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    trs = html.xpath("//table[@class='tablelist']//tr[position()>1 and position()<12]")
    for tr in trs:
        href = tr.xpath('.//a/@href')[0]
        url = base_domain + href
        detail_urls.append(url)
    return detail_urls


def parse_detail_page(url):
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    zoomE = html.xpath('//table[@class="tablelist textl"]')[0]
    title = zoomE.xpath('.//tr[1]/td/text()')[0]
    city = zoomE.xpath('.//tr[2]/td[1]/text()')[0]
    category = zoomE.xpath('.//tr[2]/td[2]/text()')[0]
    nums = zoomE.xpath('.//tr[2]/td[3]/text()')[0]
    duty = zoomE.xpath('.//tr[3]//ul//text()')
    dutys = ''
    for i in duty:
        dutys = dutys + i.strip()
    require = zoomE.xpath('.//tr[4]//ul//text()')
    requires = ''
    for i in require:
        requires = requires + i.strip()
    position = {
        'title': title,
        'city': city,
        'category': category,
        'nums': nums,
        'dutys': dutys,
        'requires': requires
    }
    return position


if __name__ == '__main__':
    for i in range(1, 10):
        url = base_url.format(i)
        detail_urls = parse_url(url)
        for detail_url in detail_urls:
            position = parse_detail_page(detail_url)
            positions.append(position)
            print(position)