# When scraping pages whose HTML is malformed, use html5lib to parse the
# non-conforming markup.
# Install the parser:
#   pip install html5lib
from lxml import etree
from bs4 import BeautifulSoup
# Sample markup: each row has a closing </tr> tag but no opening <tr> tag.
text = """
<table>
<td>姓名</td>
<td>年龄</td>
</tr>
<td>出生日期</td>
<td>地址</td>
</tr>
<td>说明</td>
<td>备注</td>
</tr>
</table>
"""

# Step 1: parse the raw text directly with lxml's HTML parser.
html = etree.HTML(text)
# Result: [] -- lxml does not reconstruct the missing <tr> elements,
# so the row-based path matches nothing.
print(html.xpath('//table/tr[1]/td[1]/text()'))

# Step 2: let BeautifulSoup + html5lib repair the malformed markup first.
soup = BeautifulSoup(text, 'html5lib')

# prettify() is used here only to display the repaired tree. The output is
# roughly as follows; the missing tags have been filled in automatically:
#
# <html><head></head><body>
# <table>
# <tbody>
# <tr>
# <td>姓名</td>
# <td>年龄</td>
# </tr>
# <tr>
# <td>出生日期</td>
# <td>地址</td>
# </tr>
# <tr>
# <td>说明</td>
# <td>备注</td>
# </tr>
# </tbody>
# </table>
# </body></html>
print(soup.prettify())

# Step 3: re-parse the repaired markup with lxml and query it with XPath.
# Serialize with str(soup) rather than soup.prettify(): prettify() inserts
# newlines and indentation inside text nodes, which would turn the cell text
# into something like '\n 姓名\n ' instead of the original value.
html = etree.HTML(str(soup))
# Result: ['姓名']
print(html.xpath('//table/tbody/tr[1]/td[1]/text()'))