A friend of mine has been reading a novel online but found the site's ads annoying, so he asked me to write a scraper to pull the text down. A quick test showed the target site is nothing but simple static pages, so the script took about ten minutes to write. Recording it here~
```python
# coding=utf-8
"""
:author: 小书童
:url: https://www.cdsy.xyz/computer/programme/Python/
:copyright: © 2024 cdsy.xyz
:motto: gogogo~.
"""

import time

import requests
from lxml import etree


def gecko(save_text_path, part=1, chapter=1):
    url = "http://www.erhaoshouzhang.cn/{}/{}.html".format(part, chapter)
    resp = requests.get(url)
    print(resp.status_code, chapter)

    if resp.status_code == 200:
        # The site serves GBK-encoded pages
        wb_data = resp.content.decode("gbk")
        html = etree.HTML(wb_data)

        with open(save_text_path, "a", encoding='utf-8-sig', newline='') as f:
            # Chapter title
            html_title = html.xpath('/html/body/div[2]/div/div[1]/div[1]/h2')
            if html_title:  # xpath() returns a list, never None; test for emptiness
                f.write('第%s章--' % chapter)
                for t in html_title:
                    if t.text is not None:
                        f.write(t.text)
                f.write('\r\n')

            # Body text wrapped in <p> tags
            html_p_data = html.xpath('/html/body/div[2]/div/div[1]/p')
            if html_p_data:
                for node_p in html_p_data:
                    if node_p.text is not None:
                        f.write(node_p.text)
                        f.write('\r\n')
                f.write('\r\n')

            # Some chapters put the text in a <pre> block instead
            html_pre_data = html.xpath('/html/body/div[2]/div/div[1]/pre')
            if html_pre_data:
                for node_pre in html_pre_data:
                    if node_pre.text is not None:
                        f.write(node_pre.text)
                        f.write('\r\n')
                f.write('\r\n')

            # Text nodes separated by <br> inside the first <p>
            html_p_br_data = html.xpath('/html/body/div[2]/div/div[1]/p[1]/text()')
            if html_p_br_data:
                for cache in html_p_br_data:
                    # str.replace() returns a new string, so reassign it
                    cache = cache.replace('\u3000', '')
                    f.write(cache)
                    f.write('\r\n')
                f.write('\r\n')
    elif resp.status_code == 404:
        print("Fetch failed: the URL returned 404")
        # sys.exit()
    else:
        raise Exception("Error, result is %s" % resp.text)
    time.sleep(5.0)  # pause between requests to go easy on the site


if __name__ == '__main__':
    # Part 1 has 296 chapters, so fetch them one page at a time
    for i in range(1, 297):
        # part: the novel has parts 1, 2, 3, ..., exposed as a parameter
        # chapter: chapter number within the part
        gecko("./test.txt", part=1, chapter=i)
```
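The whole extraction hinges on `etree.HTML` plus XPath. As a quick illustration of the two access patterns the script mixes (element nodes via `.text` versus `text()` nodes returned directly as strings), here is a self-contained demo against a made-up snippet; the markup below is mine, not the target site's actual structure:

```python
from lxml import etree

# Made-up HTML mimicking a chapter page (not the real site's markup)
snippet = """
<html><body>
  <div><h2>第1章 标题</h2>
    <p>第一段正文。</p>
    <p>第二段正文。</p>
  </div>
</body></html>
"""

html = etree.HTML(snippet)
# XPath matching elements: each result is a node, .text is its inner text
print([h2.text for h2 in html.xpath('//h2')])  # ['第1章 标题']
# XPath ending in text(): the results are the strings themselves
print(html.xpath('//p/text()'))                # ['第一段正文。', '第二段正文。']
```

Either way, `xpath()` always returns a list; an empty list (not `None`) means the page didn't match, which is why the script tests truthiness rather than `is not None`.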
Before running it, install the third-party parsing dependency (`requests` is also imported; install it the same way if your environment doesn't already have it):

```bash
pip3 install lxml
```
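One caveat: the loop hits the site 296 times with bare `requests.get` calls and no timeout. If the server starts rejecting or stalling requests, a shared session with a browser-like User-Agent, a timeout, and automatic retries usually helps. A minimal sketch of that setup, where the helper name, header value, and retry counts are my own choices and not tested against this particular site:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session():
    # Hypothetical helper: reuse one connection and retry transient failures
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})
    retry = Retry(total=3, backoff_factor=1.0,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session


# Inside gecko(), the request would then become:
#   resp = session.get(url, timeout=10)
```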