利用requests+pyquery/selenium爬取塔读网站的小说（解决JS渲染的问题）

时间：03-27来源：作者：点击数：77

一、环境依赖

安装requests,selenium,pyquery模块，并下载chromedriver,配置好环境。

#python3
pip install requests selenium pyquery

我的selenium配置笔记

#如果用browser = webdriver.Chrome()报错，可以尝试以下命令
options = webdriver.ChromeOptions()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument("window-size=1024,768")
browser = webdriver.Chrome(options=options)

#镜像文件，根据chrome版本对应下载chromedriver
http://npm.taobao.org/mirrors/chromedriver/
sudo mv chromedriver /usr/bin

二、实现

1. 基本原理

因为http://www.tadu.com/存在JS渲染问题，直接用requests.get()并不能爬取到小说内容，并且请求时必须加上headers，也就是User-Agent，用于骗过反爬措施。

某一章节，切换到preview，见下图

可以看到并没有章节内容，如果直接请求某一章节的url并不能得到我们想要东西。

解决这类问题，有两种办法，一是用selenium(可以模拟浏览器)，这样就能得到所有完整的源代码；二是用继续分析，看看在哪里请求的章节内容，如果能找到正确url就解决问题了。

2. selenium代码

# -*- coding: utf-8 -*-
__author__ = 'wardseptember'
__date__ = '19-01-27'

import requests
import os
from pyquery import PyQuery as pq
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By

# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC


'''
爬取塔读文学(http://www.tadu.com)上的小说
此例子爬取"聊斋志异"
'''

#如果想爬取其他书籍只需更改bookUrl
bookUrl = 'http://www.tadu.com/book/catalogue/735'
chapterDict = {}
bookName = ''
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/71.0.3578.98 Safari/537.36'
}
browser = webdriver.Chrome()  # 启动浏览器，可以发现弹出一个浏览器


# 获取所有章节标题和链接
def getChapterDict(url):
    try:
        html = requests.get(url, timeout=100, headers=headers)
        html.encoding = html.apparent_encoding
        content = html.content
        doc = pq(content)
        global bookName
        bookName = doc('#container .right .book-detail.catalog-tip h1 a').text()
        items = doc('#container .right .detail-chapters ul li h5 a').items()
        for item in items:
            chapterDict[item.text().strip()] = 'http://www.tadu.com' + item.attr('href')
    except Exception as e:
        print(str(e))


def getChapter(chapterUrl):
    # 获取网页内容
    try:
        browser.get(chapterUrl)
        time.sleep(3)
        # 等网页加载完成,才能获得text,继续向下进行
        text = browser.find_element(By.CSS_SELECTOR, '.main_ .text-content-.f-l .article_.c-3').text
        # 处理所的内容，使其更整洁
        text = text.replace('\n', '\n\n    ')
        text = '    ' + text
        return text
    except Exception as e:
        print(str(e))
        return " "


# 写入txt文件
def writeToTxt():
    global bookName
    bookName = bookName + '.txt'
    with open(bookName, 'w', encoding='utf-8') as f:
        f.close()
    try:
        for chapterName, value in chapterDict.items():
            text = getChapter(value)
            with open(bookName, 'a', encoding='utf-8') as f:
                f.write(chapterName + '\n\n\n')
                f.write(text + '\n')
                print(bookName[:-4] + ':' + chapterName + '-->写入成功')
                f.close()
        print("所有章节写入完成")
    except Exception as e:
        print(str(e))


def main():
    getChapterDict(bookUrl)
    writeToTxt()
    browser.close()


if __name__ == '__main__':
    main()

运行截图

3. requests+pyquery实现代码

继续上面所说，既然JS渲染使我们不能直接get到章节内容，那么就去找渲染的内容。

在preview下逐个查看url，可以看到，这个比较长的名字就是我们所要的内容。

切换到Headers，可以看到url

在切换到elements，ctrl+f可以全局搜索，输入刚才url,如e5cb3bfe508be412d823e0590090bd86或着http://m.tadu.com/_book_part/e5cb3bfe508be412d823e0590090bd86

结果如下

那么我们直接解析到这个value的值接解决问题了，现在我们访问一下http://m.tadu.com/_book_part/e5cb3bfe508be412d823e0590090bd86

如图

这就是我们想要的东西，乱码不要紧，utf-8编码一下就行了。

下面上代码

# -*- coding: utf-8 -*-
__author__ = 'wardseptember'
__date__ = '19-01-27'


import requests
import os
from pyquery import PyQuery as pq
import time



'''
爬取塔读文学(http://www.tadu.com)上的小说
此例子爬取"聊斋志异"
'''

bookUrl='http://www.tadu.com/book/catalogue/735'
chapterDict={}
bookName=''
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/71.0.3578.98 Safari/537.36'
}

def getChapterDict(url):
    try:
        html = requests.get(url, timeout=100,headers=headers)
        html.encoding = html.apparent_encoding
        content = html.content
        doc = pq(content)
        global bookName
        bookName=doc('#container .right .book-detail.catalog-tip h1 a').text()
        items=doc('#container .right .detail-chapters ul li h5 a').items()
        for item in items:
            chapterDict[item.text().strip()]='http://www.tadu.com'+item.attr('href')
    except Exception as e:
        print(str(e))

def getChapter_2(chapterUrl):
    try:
        html=requests.get(chapterUrl,timeout=100,headers=headers)
        html.encoding=html.apparent_encoding
        content=html.content
        doc=pq(content)
        textUrl=doc('body #bookPartResourceUrl').attr('value')
        htmlTxt=requests.get(textUrl,timeout=100,headers=headers)
        con=htmlTxt.content
        htmlstr=con.decode('utf-8')[19:-3]
        doc_2=pq(htmlstr)
        items=doc_2('p').items()
        text='    '
        for item in items:
            text=text+item.text()+'\n'
        text = text.replace('\n', '\n\n    ')
        return text
    except Exception as e:
        print(str(e))
        return " "

def writeToTxt():
    global bookName
    bookName=bookName+'.txt'
    with open(bookName, 'w', encoding='utf-8') as f:
        f.close()
    try:
        for chapterName,value in chapterDict.items():
            text=getChapter_2(value)
            with open(bookName,'a',encoding='utf-8') as f:
                f.write(chapterName+'\n\n\n')
                f.write(text+'\n')
                print(bookName[:-4]+':'+chapterName+'-->写入成功')
                f.close()
        print("所有章节写入完成")
    except Exception as e:
        print(str(e))

def main():
    getChapterDict(bookUrl)
    writeToTxt()


if __name__=='__main__':
    main()