1. xpath介绍
XML Path Language,即XML路径语言,可以搜索XML文档,也可以搜索HTML文档。
2. 初始化解析
2.1 解析HTML文本:
from lxml import etree  # import the etree module from the lxml library
# NOTE(review): 'res.text' looks like a placeholder -- presumably the real HTML
# source (e.g. the .text of a fetched response) is meant to be passed here.
html = etree.HTML('res.text')  # build a tree from HTML text; the result supports XPath queries
result = etree.tostring(html)  # serialize the (auto-corrected) HTML tree to bytes
print(result.decode('utf-8'))  # decode the bytes into a str before printing
2.2 解析本地文件:
from lxml import etree
# Parse a local file with the HTML parser so that non-XML-clean markup is accepted.
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)  # serialize the parsed tree to bytes
print(result.decode('utf-8'))  # decode the bytes into a str before printing
3. 节点获取
本地HTML文本:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<!-- Sample document queried by the XPath examples: one list of five items, each wrapping a single link. -->
<div>
<ul>
<li class="li li-first" name="item"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body>
</html>
节点获取:
from lxml import etree
# Parse the local sample file with the HTML parser; `html` is the tree that
# the XPath queries in the following examples are run against.
html = etree.parse('./test.html',etree.HTMLParser())
3.1获取所有li节点
# `//li` selects every li element, wherever it appears in the document.
result = html.xpath('//li')
3.2获取li所有直接a节点
# `li/a` selects a elements that are direct children of an li; the leading `.`
# makes the path relative to the node that xpath() is called on.
result = html.xpath('.//li/a')
3.3获取href属性为link4.html的a节点的父节点的class属性
# Find the a element whose href is link4.html, step up to its parent with `..`,
# and read that parent's class attribute.
result = html.xpath('//a[@href="link4.html"]/../@class')
3.4属性匹配,选择class属性为item-0的li节点
# The predicate keeps only li elements whose class attribute equals "item-0".
result = html.xpath('//li[@class="item-0"]')
3.5文本获取
# `/a/text()` returns the text nodes located directly inside the child a element.
result = html.xpath('//li[@class="item-0"]/a/text()')
# `//text()` returns the text nodes at any depth below the matching li.
result = html.xpath('//li[@class="item-0"]//text()')
3.6属性获取
# `@href` returns the href attribute value of each a element directly under an li.
result = html.xpath('//li/a/@href')
3.7多属性值查找
# contains() matches when the class attribute contains the substring "li",
# so an element whose class holds several values can still be selected.
result = html.xpath('//li[contains(@class,"li")]/a/text()')
3.8 多属性匹配
# `and` combines two conditions: class must contain "li" and name must equal "item".
result =html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
3.9顺序选择,注意索引从1开始
# XPath positions are 1-based, unlike Python indexes.
result = html.xpath('//li[1]/a/text()') # the first li
result = html.xpath('//li[last()]/a/text()') # the last li
result = html.xpath('//li[position()<3]') # position less than 3, i.e. the first and second li
result = html.xpath('//li[last()-2]/a/text()') # the third li from the end
3.10节点轴选择
# Axis examples. Every assignment overwrites `result`, so only the outcome of
# the last query is printed.
result = html.xpath('//li[1]/ancestor::*')  # all ancestor elements of the first li
result = html.xpath('//li[1]/ancestor::div')  # only the div ancestors of the first li
result = html.xpath('//li[1]/attribute::*')  # all attribute values of the first li
result = html.xpath('//li[1]/child::a[@href="link1.html"]')  # direct child a whose href is link1.html
result = html.xpath('//li[1]/descendant::span')  # all span descendants of the first li
result = html.xpath('//li[1]/following::*[2]/text()')  # text of the second element that follows the first li
result = html.xpath('//li[1]/following-sibling::*')  # all later siblings of the first li
print(result)
4.案例--猫眼TOP100电影信息抓取
import requests
from lxml import etree
import json
def create_requests(page, timeout=10):
    """Fetch one page of the Maoyan TOP100 board.

    Args:
        page: Zero-based page number. It is sent to the server as the
            ``offset`` query parameter with the value ``page * 10``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = 'https://maoyan.com/board/4'
    params = {'offset': page * 10}
    # A browser-like User-Agent is sent instead of the library default.
    headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36',
    }
    return requests.get(url=url, params=params, headers=headers, timeout=timeout)
def download_data(res):
    """Extract the movie entries of one board page and collect them.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        The module-level ``movies_list`` after one dict per movie found on
        this page has been appended to it. Each dict has the keys
        ``index``, ``name``, ``actor``, ``time``, ``score`` and ``img``.
    """
    # Parse the page source into an element tree that supports XPath queries.
    x_etree = etree.HTML(res.text)
    index_list = x_etree.xpath('//dd/i/text()')
    img_list = x_etree.xpath('//dd/a/img[2]/@data-src')
    name_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[1]/a/text()')
    actor_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[2]/text()')
    time_list = x_etree.xpath('//dl//dd/div/div/div[1]/p[3]/text()')
    score_list1 = x_etree.xpath('//dl//dd/div/div/div[2]/p/i[1]/text()')
    score_list2 = x_etree.xpath('//dl//dd/div/div/div[2]/p/i[2]/text()')

    # The score is read from two separate <i> elements and joined back into
    # one string (presumably integer part and fraction part -- verify on the page).
    score_list = [first + second for first, second in zip(score_list1, score_list2)]

    # zip() stops at the shortest list, so a page with a missing field yields
    # fewer entries instead of raising IndexError.
    columns = zip(index_list, name_list, actor_list, time_list, score_list, img_list)
    for index, name, actor, release_time, score, img in columns:
        movies_list.append({
            'index': index,
            'name': name,
            # Drop the first 3 characters after trimming whitespace
            # (presumably a label prefix -- verify against the page).
            'actor': actor.strip()[3:],
            # Drop the first 5 characters after trimming whitespace
            # (presumably a label prefix -- verify against the page).
            'time': release_time.strip()[5:],
            'score': score,
            'img': img,
        })
    return movies_list
def save_content(movies_dict):
    """Write the given movie records to ``./movies.json``.

    Args:
        movies_dict: JSON-serializable collection of movie records. The
            file is overwritten on every call.
    """
    with open('./movies.json', 'w', encoding='utf-8') as fp:
        # indent=2 pretty-prints the output; ensure_ascii=False keeps
        # Chinese characters readable instead of escaping them as \uXXXX.
        json.dump(movies_dict, fp, indent=2, ensure_ascii=False)
def main():
    """Ask for a page count, then download and save that many board pages.

    Raises:
        ValueError: If the text entered by the user is not an integer.
    """
    page_count = int(input('请输入要查询的页数:'))
    print('-----------下载中-----------')
    for page in range(page_count):
        res = create_requests(page)
        movies = download_data(res)
        # Saved after every page so that the pages fetched so far are on
        # disk even if a later request fails.
        save_content(movies)
    print('----------下载完成----------')
# Accumulator that download_data() appends to. It is defined at module level
# so that the name also exists when this file is imported rather than run.
movies_list = []

if __name__ == '__main__':
    main()