您当前的位置:首页 > 计算机 > 编程开发 > Python

python爬虫6--lxml解析库

时间:05-19来源:作者:点击数:

1. xpath介绍

XML Path Language,即XML路径语言,可以搜索XML文档,也可以搜索HTML文档。

2. 初始化解析

2.1 解析HTML文本:

from lxml import etree     #导入lxml库的etree模块

html = etree.HTML(res.text)    #调用HTML类初始化,构造了一个Xpath解析对象

result = etree.tostring(html)      #输出修正后的HTML代码

print(result.decode('utf-8'))     #转换成str类型

2.2 解析本地文件:

from lxml import etree

html = etree.parse('./test.html',etree.HTMLParser())

result = etree.tostring(html)

print(result.decode('utf-8'))

3. 节点获取

本地HTML文本:


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div>
        <ul>
            <li class="li li-first" name="item"><a href="link1.html"><span>first item</span></a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">first item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</body>
</html>

节点获取:
from lxml import etree
html = etree.parse('./test.html',etree.HTMLParser())
3.1获取所有li节点
result = html.xpath('//li')
3.2获取li所有直接a节点
result = html.xpath('.//li/a')
3.3获取href属性为link4.html的a节点的父节点的class属性
result = html.xpath('//a[@href="link4.html"]/../@class')
3.4属性匹配,选择class属性为item-0的li节点
result = html.xpath('//li[@class="item-0"]')
3.5文本获取
result = html.xpath('//li[@class="item-0"]/a/text()')
 result = html.xpath('//li[@class="item-0"]//text()')
3.6属性获取
 result = html.xpath('//li/a/@href')
3.7多属性值查找
result = html.xpath('//li[contains(@class,"li")]/a/text()')
3.8 多属性匹配
 result =html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
3.9顺序选择,注意索引从1开始
 result = html.xpath('//li[1]/a/text()')   #第一个
 result = html.xpath('//li[last()]/a/text()')  #最后一个
 result = html.xpath('//li[position()<3]')  #位置小于3的,即第1和2个
 result = html.xpath('//li[last()-2]/a/text()')   #倒数第三个
3.10节点轴选择
 result = html.xpath('//li[1]/ancestor::*')  #第一个li的所有祖先节点
 result = html.xpath('//li[1]/ancestor::div')  #第一个li的所有div祖先节点
 result = html.xpath('//li[1]/attribute::*')  #第一个li的所有属性值
 result = html.xpath('//li[1]/child::a[@href="link1.html"]')  #获取属性href为link1.html的直接子节点a
 result = html.xpath('//li[1]/descendant::span')  #第一个li的所有span子孙节点
 result = html.xpath('//li[1]/following::*[2]/text()')  #后续第二个节点
 result = html.xpath('//li[1]/following-sibling::*')  #后续所有同级节点
 print(result)
4.案例--猫眼TOP100电影信息抓取
import requests
from lxml import etree
import json


def create_requests(page):
    """Fetch one page of the Maoyan TOP100 board.

    page: zero-based page index; mapped to the board's `offset`
    query parameter (10 movies per page).
    Returns the `requests.Response` for the board page.
    """
    query = {
        'offset': page*10
    }
    # Desktop browser UA so the site serves the normal HTML page.
    request_headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
    }
    return requests.get(url='https://maoyan.com/board/4', params=query, headers=request_headers)

def download_data(res):
    """Parse one Maoyan board page out of *res* (a requests.Response).

    Extracts rank, poster URL, title, cast, release date and score for
    every movie on the page, appends one dict per movie to the
    module-level ``movies_list`` accumulator, and returns that list.
    """
    # Build an lxml element tree from the raw HTML.
    tree = etree.HTML(res.text)

    ranks = tree.xpath('//dd/i/text()')
    posters = tree.xpath('//dd/a/img[2]/@data-src')
    titles = tree.xpath('//dl//dd/div/div/div[1]/p[1]/a/text()')
    casts = tree.xpath('//dl//dd/div/div/div[1]/p[2]/text()')
    dates = tree.xpath('//dl//dd/div/div/div[1]/p[3]/text()')
    # The score is split over two <i> tags: integer part and fraction part.
    score_ints = tree.xpath('//dl//dd/div/div/div[2]/p/i[1]/text()')
    score_fracs = tree.xpath('//dl//dd/div/div/div[2]/p/i[2]/text()')
    scores = [score_ints[i] + score_fracs[i] for i in range(len(score_ints))]

    for i in range(len(ranks)):
        movies_list.append({
            'index': ranks[i],
            'name': titles[i],
            # strip the "主演:" prefix — presumably 3 chars; TODO confirm against live page
            'actor': casts[i].strip()[3:],
            # strip the "上映时间:" prefix (5 chars)
            'time': dates[i].strip()[5:],
            'score': scores[i],
            'img': posters[i],
        })
    return movies_list

def save_content(movies_dict):
    """Serialize the collected movie records to ./movies.json.

    movies_dict: the list of movie dicts to persist (JSON-serializable).

    Bug fix: the original dumped the module-global ``movies_list`` and
    silently ignored the ``movies_dict`` argument; now the argument is
    written, so the function does what its signature promises.
    indent=2 pretty-prints; ensure_ascii=False keeps Chinese text
    readable instead of \\uXXXX escapes.
    """
    with open('./movies.json', 'w+', encoding='utf-8') as fp:
        json.dump(movies_dict, fp, indent=2, ensure_ascii=False)


def main():
    """Prompt for a page count, then scrape and persist each board page.

    After every page the full accumulated list is re-written to disk,
    so an interrupted run still leaves a valid movies.json behind.
    """
    total_pages = int(input('请输入要查询的页数:'))
    print('-----------下载中-----------')
    for page_index in range(total_pages):
        response = create_requests(page_index)
        collected = download_data(response)
        save_content(collected)
    # Original used `for/else` with no break, so this always ran after the loop.
    print('----------下载完成----------')

if __name__ == '__main__':
    # Module-level accumulator: download_data() appends every page's
    # movie dicts here, so it grows across pages as main() iterates.
    movies_list = []
    main()
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门