1. Introduction to XPath
XPath (XML Path Language) is a path-based query language for selecting nodes; it can search XML documents and HTML documents alike.
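Before going further, a minimal sketch of what an XPath query looks like in Python with lxml (the library used throughout these notes); the same xpath() call works on XML and, as shown below, on HTML:
- from lxml import etree
- root = etree.fromstring('<books><book lang="en">A</book><book lang="de">B</book></books>')
- print(root.xpath('//book[@lang="en"]/text()'))   # expected: ['A']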
2. Initializing the Parser
2.1 Parsing an HTML string:
- from lxml import etree                  # import the etree module from lxml
- html = etree.HTML(res.text)              # the HTML class builds an XPath parsing object (res is a requests response, as in section 4)
- result = etree.tostring(html)            # serialize the corrected HTML back to bytes
- print(result.decode('utf-8'))            # decode to str and print
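The "corrected" part matters: etree.HTML tolerates broken markup and fills in missing tags. A small sketch of what that correction looks like (output shown approximately):
- broken = '<li><a href="link.html">some text'    # unclosed tags, no html/body wrapper
- print(etree.tostring(etree.HTML(broken)).decode('utf-8'))
- # <html><body><li><a href="link.html">some text</a></li></body></html>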
2.2 Parsing a local file:
- from lxml import etree
- html = etree.parse('./test.html', etree.HTMLParser())   # pass HTMLParser explicitly; the default parser expects well-formed XML
- result = etree.tostring(html)
- print(result.decode('utf-8'))
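An equivalent approach is to read the file yourself (assuming it is UTF-8 encoded) and hand the text to etree.HTML as in 2.1; note that etree.parse returns an ElementTree while etree.HTML returns the root Element, but both provide the xpath() method used below:
- with open('./test.html', encoding='utf-8') as f:
-     html = etree.HTML(f.read())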
3. Selecting Nodes
The local HTML file used in the examples below (test.html):
- <!DOCTYPE html>
- <html lang="en">
- <head>
-     <meta charset="UTF-8">
-     <title>Title</title>
- </head>
- <body>
-     <div>
-         <ul>
-             <li class="li li-first" name="item"><a href="link1.html"><span>first item</span></a></li>
-             <li class="item-1"><a href="link2.html">second item</a></li>
-             <li class="item-inactive"><a href="link3.html">third item</a></li>
-             <li class="item-1"><a href="link4.html">first item</a></li>
-             <li class="item-0"><a href="link5.html">fifth item</a></li>
-         </ul>
-     </div>
- </body>
- </html>
All of the following examples share this setup:
- from lxml import etree
- html = etree.parse('./test.html', etree.HTMLParser())
3.1 Select all li nodes
- result = html.xpath('//li')
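The query returns an ordinary Python list of Element objects (the addresses in the output will differ), and individual nodes can be inspected further:
- print(result)          # [<Element li at 0x...>, <Element li at 0x...>, ...]
- print(result[0].tag)   # li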
3.2 Select all a nodes that are direct children of li nodes
- result = html.xpath('//li/a')
3.3 Select the class attribute of the parent of the a node whose href is link4.html
- result = html.xpath('//a[@href="link4.html"]/../@class')
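The .. step is shorthand for the parent axis, so the same selection can also be written as follows; for the sample file both return ['item-1']:
- result = html.xpath('//a[@href="link4.html"]/parent::*/@class')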
3.4 Attribute matching: select the li nodes whose class attribute is item-0
- result = html.xpath('//li[@class="item-0"]')
3.5 Extracting text
- result = html.xpath('//li[@class="item-0"]/a/text()')   # text nodes that are direct children of the a
- result = html.xpath('//li[@class="item-0"]//text()')    # all descendant text nodes of the li
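The difference between the two forms: /text() only returns text nodes that are direct children of the selected node, while //text() returns text from all descendants. In the sample file the first li keeps its text inside a span, so the direct form comes back empty; a sketch of the expected results:
- print(html.xpath('//li[@class="li li-first"]/a/text()'))    # []  (the text sits inside the span)
- print(html.xpath('//li[@class="li li-first"]/a//text()'))   # ['first item']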
3.6 Extracting attributes
- result = html.xpath('//li/a/@href')
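For the sample file this collects the href value of every a element that is a direct child of a li:
- print(result)   # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']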
3.7 Matching a multi-valued attribute with contains()
- result = html.xpath('//li[contains(@class,"li")]/a/text()')   # note: [] for this sample, because the matching a keeps its text inside a span (see 3.5)
3.8 Matching on multiple attributes with and
- result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')   # same span caveat as in 3.7
3.9 Selecting by position (note: XPath indices start at 1)
- result = html.xpath('//li[1]/a/text()')          # the first li
- result = html.xpath('//li[last()]/a/text()')     # the last li
- result = html.xpath('//li[position()<3]')        # positions less than 3, i.e. the 1st and 2nd li
- result = html.xpath('//li[last()-2]/a/text()')   # the third li from the end
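For the sample file, the last()-based expressions should return roughly the following (the //li[1] expression again returns [] because of the span inside the first li, see 3.5):
- print(html.xpath('//li[last()]/a/text()'))      # ['fifth item']
- print(html.xpath('//li[last()-2]/a/text()'))    # ['third item']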
3.10 Selecting with node axes
- result = html.xpath('//li[1]/ancestor::*')                     # all ancestor nodes of the first li
- result = html.xpath('//li[1]/ancestor::div')                   # only the div ancestors of the first li
- result = html.xpath('//li[1]/attribute::*')                    # all attribute values of the first li
- result = html.xpath('//li[1]/child::a[@href="link1.html"]')    # direct child a nodes whose href is link1.html
- result = html.xpath('//li[1]/descendant::span')                # all span descendants of the first li
- result = html.xpath('//li[1]/following::*[2]/text()')          # text of the second element after the first li in document order
- result = html.xpath('//li[1]/following-sibling::*')            # all following siblings of the first li
- print(result)
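For example, the attribute axis returns the values of both attributes on the first li:
- print(html.xpath('//li[1]/attribute::*'))   # ['li li-first', 'item']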
4. Example: Scraping the Maoyan TOP100 Movie Board
- import requests
- from lxml import etree
- import json
-
-
- def create_requests(page):
-     # build the request for one page of the board; each page is offset by 10 entries
-     url = 'https://maoyan.com/board/4'
-     data = {
-         'offset': page * 10
-     }
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
-     }
-     res = requests.get(url=url, params=data, headers=headers)
-     return res
-
-
- def download_data(res):
-     # parse the page into an XPath-capable tree and pull out one list per field
-     x_etree = etree.HTML(res.text)
-     index_list = x_etree.xpath('//dd/i/text()')
-     img_list = x_etree.xpath('//dd/a/img[2]/@data-src')
-     name_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[1]/a/text()')
-     actor_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[2]/text()')
-     time_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[3]/text()')
-     score_list1 = x_etree.xpath('//dl//dd/div/div/div[2]/p/i[1]/text()')   # integer part of the score
-     score_list2 = x_etree.xpath('//dl//dd/div/div/div[2]/p/i[2]/text()')   # fractional part of the score
-     score_list = []
-     for i in range(len(score_list1)):
-         score_ = score_list1[i] + score_list2[i]
-         score_list.append(score_)
-
-     for i in range(len(index_list)):
-         movie_dict = {}
-         movie_dict['index'] = index_list[i]
-         movie_dict['name'] = name_list[i]
-         movie_dict['actor'] = actor_list[i].strip()[3:]    # drop the leading "主演:" label
-         movie_dict['time'] = time_list[i].strip()[5:]      # drop the leading "上映时间:" label
-         movie_dict['score'] = score_list[i]
-         movie_dict['img'] = img_list[i]
-         movies_list.append(movie_dict)
-     return movies_list
-
-
- def save_content(movies_list):
-     with open('./movies.json', 'w+', encoding='utf-8') as fp:
-         # fp.write(str(movies_list))
-         # indent=2 pretty-prints the output; ensure_ascii=False keeps Chinese text readable instead of \u escapes
-         json.dump(movies_list, fp, indent=2, ensure_ascii=False)
-
-
- def main():
-     pages = int(input('Enter the number of pages to fetch: '))
-     print('----------- downloading -----------')
-     for page in range(pages):
-         res = create_requests(page)
-         movies_list = download_data(res)
-         save_content(movies_list)
-     print('----------- done -----------')
-
-
- if __name__ == '__main__':
-     movies_list = []   # global list that accumulates movies across pages
-     main()
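To check the result, the saved file can be loaded back with the json module; a minimal usage sketch (the field names follow the dictionaries built above):
- import json
-
- with open('./movies.json', encoding='utf-8') as fp:
-     movies = json.load(fp)
- print(len(movies))         # number of movies fetched
- print(movies[0]['name'])   # title of the top-ranked movie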