
Scraping Ctrip with Pagination Using Selenium and lxml


Link: https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9

It does not matter whether the content is dynamically loaded or not: with Selenium we simply hand the rendered page source to lxml for parsing. Selenium's main drawback is that it is a bit slow. We also need to collect the detail-page links, and since we build the pagination URLs ourselves, we will not use the simulated-click approach.
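The core of the approach fits in a few lines: start a headless Chrome, let it render the page, then hand driver.page_source to lxml and query it with XPath. A minimal sketch of that handoff (this assumes chromedriver is on your PATH; the full code below passes an explicit path instead):

from selenium import webdriver
from lxml import etree

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH

driver.get('https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9')
html = etree.HTML(driver.page_source)  # hand the rendered source to lxml
titles = html.xpath('//div[@class="list_product_item_border"]/div/div[2]/p/span/text()')
print(titles)
driver.quit()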
The full implementation is as follows:
from selenium import webdriver
from lxml import etree
import json


class Xiecheng(object):
    def __init__(self):
        self.url = 'https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9'
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(
            executable_path='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe',
            options=self.options)

    def get_data(self):
        # Load the current list page, scroll to the bottom so lazy-loaded items render,
        # then return the rendered page source.
        self.driver.get(self.url)
        self.driver.execute_script('var q=document.documentElement.scrollTop=10000')
        self.driver.implicitly_wait(10)  # implicit wait (applies to element lookups)
        return self.driver.page_source

    def parse_data(self, data):
        html = etree.HTML(data)
        node_list = html.xpath('//div[@class="list_product_item_border"]')
        data_list = list()
        for node in node_list:
            temp = {}
            temp['标题'] = node.xpath('./div/div[2]/p/span/text()')
            # The product id lives on the parent node; use it to build the detail-page URL.
            id_list = node.xpath('./parent::div[@class="list_product_box js_product_item"]/@data-track-product-id')
            for id in id_list:
                temp['详情页链接'] = 'https://vacations.ctrip.com/travel/detail/p' + id + '/?city=475'
            # The price sits in one of two positions, so join both with an XPath union.
            price = node.xpath(
                './div/div[2]/div/div[1]/div[2]/span/strong/text()|./div/div[2]/div/div[1]/div/span/strong/text()')
            for p in price:
                temp['价格'] = '¥' + p
            temp['描述信息'] = node.xpath('./div/div[2]/div/div[2]/div[2]/div/p/text()')
            temp['供应商'] = node.xpath('./div/div[2]/div/div[2]/p/text()')
            # Open the detail page and pull the extra fields from its rendered source.
            detail_url = temp['详情页链接']
            self.driver.get(detail_url)
            self.driver.execute_script('var q=document.documentElement.scrollTop=10000')
            self.driver.implicitly_wait(10)
            detail_page = self.driver.page_source
            hr = etree.HTML(detail_page)
            temp['详情描述'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
            temp['服务'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
            temp['特色'] = hr.xpath('//div[@class="rich_content_view_20191129 '
                                  'detail_description_content_view"]/p/text()')
            temp['日程安排'] = hr.xpath('//div[@class="day_title"]/div[2]/text()')
            temp['景点描述'] = hr.xpath('//div[@id="grp-103047-schedule-poi-0"]/div/text()')
            temp['酒店'] = hr.xpath('//a[@class="itinerary_hotel_item js_Expose_Point js_mapPointHook"]/text()')
            data_list.append(temp)
        return data_list

    def next_page(self, data):
        # Read the page number carried by the last pagination link and build the
        # next list URL from it; return None when there is no next page.
        html = etree.HTML(data)
        try:
            id = html.xpath('//*[@id="root"]/div/div[1]/div[8]/div[31]/a[last()]/@data-page')[0]
        except IndexError:
            return None
        next_url = 'https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9&p=' + id
        self.url = next_url
        print(self.url)
        return id

    def save_data(self, data_list):
        # Append each record to 携程.json as one JSON object per line.
        for i in data_list:
            data = json.dumps(i, ensure_ascii=False)
            with open('携程.json', 'a', encoding='utf-8') as f:
                f.write(data + ',\n')

    def run(self):
        while True:
            data = self.get_data()
            data_list = self.parse_data(data)
            if not data_list:  # no products parsed on this page
                print('空值')
                break
            self.save_data(data_list)
            id = self.next_page(data)
            if id is None:
                break


if __name__ == '__main__':
    xiecheng = Xiecheng()
    xiecheng.run()
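One compatibility note: Selenium 4 removed the executable_path argument used in __init__ above. If you run this on a newer Selenium, the driver path goes through a Service object instead; a small sketch, assuming the same chromedriver location:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--headless')
service = Service('C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)  # Selenium 4 style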
The run output looks like this:
[screenshot of the run output]
The data is saved as JSON, which makes it easy to reuse.
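Because save_data writes one JSON object per line with a trailing comma, the file as a whole is not a single valid JSON document; to load it back, parse it line by line, for example:

import json

records = []
with open('携程.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip().rstrip(',')  # drop the trailing comma added by save_data
        if line:
            records.append(json.loads(line))
print(len(records), 'records loaded')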