爬取到不规范的 HTML 文本(例如标签缺失或未闭合)时,可以先用 html5lib 将其补全为规范的 HTML,再进行解析。
安装解析器(下面的示例还用到了 lxml 和 beautifulsoup4,如未安装需一并安装):
- pip install html5lib
- from lxml import etree
- from bs4 import BeautifulSoup
-
- # text中<tr>标签只有闭合标签,没有起始标签
- text = """
- <table>
- <td>姓名</td>
- <td>年龄</td>
- </tr>
- <td>出生日期</td>
- <td>地址</td>
- </tr>
- <td>说明</td>
- <td>备注</td>
- </tr>
- </table>
- """
- # etree.HTML 默认使用 lxml 自带的 HTML 解析器
- html = etree.HTML(text)
- # 结果:[]。lxml 没有为只有结束标签的 </tr> 补全起始标签 <tr>,因此 //table/tr 匹配不到任何节点
- print(html.xpath('//table/tr[1]/td[1]/text()'))
-
- #利用BeautifulSoup和html5lib先将不规范的html文本转为规范的文本再解析
- soup = BeautifulSoup(text,'html5lib')
- print(soup.prettify()) # 结果大致如下(实际输出带有缩进和换行):html5lib 自动补全了缺失的 <html>、<head>、<body>、<tbody>、<tr> 标签
- """
- <html><head></head><body>
- <table>
- <tbody>
- <tr>
- <td>姓名</td>
- <td>年龄</td>
- </tr>
- <tr>
- <td>出生日期</td>
- <td>地址</td>
- </tr>
- <tr>
- <td>说明</td>
- <td>备注</td>
- </tr>
- </tbody>
- </table>
- </body></html>
- """
- html = etree.HTML(soup.prettify())
- # 结果:['\n 姓名\n ']
- # 注意:prettify() 会在文本前后加入换行和缩进,所以取到的文本带有空白;可以对结果调用 .strip(),或改用 str(soup) 生成不带额外空白的 HTML
- print(html.xpath('//table/tbody/tr[1]/td[1]/text()'))