链接:https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9
- from selenium import webdriver
- from lxml import etree
- import json
-
-
class Xiecheng(object):
    """Selenium-driven scraper for Ctrip (携程) vacation package listings.

    The crawl starts at ``LIST_URL``. For every listing page it collects the
    products, opens each product's detail page for extra fields, appends the
    records to ``OUTPUT_FILE`` and then follows the pager until no further
    page id can be found.
    """

    # Listing page the crawl starts from; next_page() appends "&p=<n>" to it.
    LIST_URL = 'https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9'
    # Default chromedriver location, used when __init__ gets no driver_path.
    DRIVER_PATH = 'C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe'
    # File that save_data() appends to (relative to the working directory).
    OUTPUT_FILE = '携程.json'

    def __init__(self, driver_path=None):
        """Start a headless Chrome session.

        Args:
            driver_path: Optional path to the chromedriver executable.
                Defaults to ``DRIVER_PATH``.
        """
        self.url = self.LIST_URL
        self.opition = webdriver.ChromeOptions()
        self.opition.add_argument('--headless')
        # NOTE(review): ``executable_path`` is the older keyword; newer
        # Selenium 4 releases expect a ``Service`` object instead -- confirm
        # against the installed Selenium version before upgrading.
        self.driver = webdriver.Chrome(
            executable_path=driver_path or self.DRIVER_PATH,
            options=self.opition)
        # The implicit wait is a session-wide setting, so set it once here
        # instead of re-issuing it after every page load.
        self.driver.implicitly_wait(10)

    def _load_page(self, url):
        """Open *url*, scroll down once and return the page source."""
        self.driver.get(url)
        # Presumably the scroll is meant to trigger lazily loaded content.
        # NOTE(review): nothing waits for that content before the source is
        # read (the implicit wait only applies to element lookups).
        self.driver.execute_script('var q=document.documentElement.scrollTop=10000')
        return self.driver.page_source

    def get_data(self):
        """Return the page source of the current listing page (``self.url``)."""
        return self._load_page(self.url)

    def _parse_detail(self, page_source):
        """Extract the detail-page fields from *page_source* as a dict."""
        hr = etree.HTML(page_source)
        if hr is None:
            # Some lxml versions return None for an empty document.
            return {}
        return {
            '详情描述': hr.xpath('//p[@class="detail_title_subhead"]/text()'),
            # NOTE(review): identical XPath to '详情描述' above -- looks like
            # a copy/paste slip; the intended selector is not visible here.
            '服务': hr.xpath('//p[@class="detail_title_subhead"]/text()'),
            '特色': hr.xpath('//div[@class="rich_content_view_20191129 '
                           'detail_description_content_view"]/p/text()'),
            '日程安排': hr.xpath('//div[@class="day_title"]/div[2]/text()'),
            # NOTE(review): this element id looks specific to one product;
            # verify that it matches on other detail pages.
            '景点描述': hr.xpath('//div[@id="grp-103047-schedule-poi-0"]/div/text()'),
            '酒店': hr.xpath('//a[@class="itinerary_hotel_item js_Expose_Point js_mapPointHook"]/text()'),
        }

    def parse_data(self, data):
        """Parse one listing page and enrich each product from its detail page.

        Args:
            data: HTML source of a listing page.

        Returns:
            A list with one dict per product (possibly empty). Products
            without a product id keep only their listing-level fields,
            because no detail URL can be built for them.
        """
        html = etree.HTML(data)
        if html is None:
            return []
        data_list = []
        for node in html.xpath('//div[@class="list_product_item_border"]'):
            temp = {}
            temp['标题'] = node.xpath('./div/div[2]/p/span/text()')
            id_list = node.xpath('./parent::div[@class="list_product_box js_product_item"]/@data-track-product-id')
            if id_list:
                # As before, the last match wins when there are several.
                temp['详情页链接'] = 'https://vacations.ctrip.com/travel/detail/p' + id_list[-1] + '/?city=475'
            price = node.xpath(
                './div/div[2]/div/div[1]/div[2]/span/strong/text()|./div/div[2]/div/div[1]/div/span/strong/text()')
            if price:
                temp['价格'] = '¥' + price[-1]
            temp['描述信息'] = node.xpath('./div/div[2]/div/div[2]/div[2]/div/p/text()')
            temp['供应商'] = node.xpath('./div/div[2]/div/div[2]/p/text()')
            detail_url = temp.get('详情页链接')
            if detail_url is not None:
                # self.url is left alone on purpose: it tracks the listing
                # page, not the detail page currently open in the browser.
                temp.update(self._parse_detail(self._load_page(detail_url)))
            data_list.append(temp)
        return data_list

    def next_page(self, data):
        """Point ``self.url`` at the next listing page.

        The page id is the ``data-page`` attribute of the last ``<a>`` in the
        pager located by an absolute XPath.
        NOTE(review): the XPath depends on the exact page layout, and whether
        the last link is always "next" cannot be verified from this file.

        Args:
            data: HTML source of the current listing page.

        Returns:
            The next page id as a string, or None when none was found.
        """
        html = etree.HTML(data)
        if html is None:
            return None
        page_ids = html.xpath('//*[@id="root"]/div/div[1]/div[8]/div[31]/a[last()]/@data-page')
        if not page_ids:
            return None
        page_id = page_ids[0]
        self.url = self.LIST_URL + '&p=' + page_id
        print(self.url)
        return page_id

    def save_data(self, data_list):
        """Append every record in *data_list* to ``OUTPUT_FILE``.

        Each record is written as one UTF-8 JSON object followed by ``,`` and
        a newline, so the file as a whole is not a valid JSON document. The
        file is not created when *data_list* is empty.
        """
        if not data_list:
            return
        with open(self.OUTPUT_FILE, 'ab') as f:
            for item in data_list:
                line = json.dumps(item, ensure_ascii=False) + ',\n'
                f.write(line.encode())

    def run(self):
        """Crawl listing pages until one is empty or no next page exists.

        The browser is always shut down afterwards, even if scraping fails.
        """
        try:
            while True:
                data = self.get_data()
                data_list = self.parse_data(data)
                # parse_data() returns a list, never None, so test emptiness.
                if not data_list:
                    print('空值')
                    break
                self.save_data(data_list)
                if self.next_page(data) is None:
                    break
        finally:
            self.driver.quit()
-
-
if __name__ == '__main__':
    # Script entry point: constructing the scraper launches headless Chrome,
    # and run() crawls the listing pages until none are left.
    xiecheng = Xiecheng()
    xiecheng.run()
-
-