# Link: https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9
from selenium import webdriver
from lxml import etree
import json
class Xiecheng(object):
    """Scrape Ctrip package-tour listings and their detail pages.

    A headless Chrome session loads each listing page, every product card
    is parsed with lxml, the product's detail page is visited for extra
    fields, and each record is appended to ``OUTPUT_FILE`` as one JSON
    object per line (each line ends with a trailing comma).
    """

    # First listing page; later pages are addressed by appending ``&p=<n>``.
    LIST_URL = 'https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9'
    # Default chromedriver location, used when no path is passed to __init__.
    DRIVER_PATH = 'C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe'
    # File that save_data() appends records to (relative to the working dir).
    OUTPUT_FILE = '携程.json'
    # Scrolls the document down so content further down the page is requested.
    SCROLL_SCRIPT = 'var q=document.documentElement.scrollTop=10000'

    def __init__(self, driver_path=None):
        """Start a headless Chrome session.

        Args:
            driver_path: Path to the chromedriver executable. Defaults to
                ``DRIVER_PATH`` when omitted.
        """
        self.url = self.LIST_URL
        self.opition = webdriver.ChromeOptions()
        self.opition.add_argument('--headless')
        self.driver = webdriver.Chrome(
            executable_path=driver_path or self.DRIVER_PATH,
            options=self.opition)
        # The implicit wait is a session-wide setting, so it is configured
        # once here instead of after every navigation.
        # NOTE(review): it only affects element lookups; it does not delay
        # reading page_source, so late-loading content may be missing.
        self.driver.implicitly_wait(10)

    def _load_page(self, url):
        """Open *url*, scroll down, and return the rendered page source."""
        self.driver.get(url)
        self.driver.execute_script(self.SCROLL_SCRIPT)
        return self.driver.page_source

    def get_data(self):
        """Return the page source of the listing page at ``self.url``."""
        return self._load_page(self.url)

    def _parse_detail(self, detail_url):
        """Visit a product detail page and return its fields as a dict."""
        hr = etree.HTML(self._load_page(detail_url))
        detail = {}
        # NOTE(review): the next two fields use the same XPath, so they
        # always hold the same values - confirm the intended selector.
        detail['详情描述'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
        detail['服务'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
        detail['特色'] = hr.xpath('//div[@class="rich_content_view_20191129 '
                                'detail_description_content_view"]/p/text()')
        detail['日程安排'] = hr.xpath('//div[@class="day_title"]/div[2]/text()')
        # NOTE(review): this element id looks product-specific - verify that
        # it matches on detail pages of other products.
        detail['景点描述'] = hr.xpath('//div[@id="grp-103047-schedule-poi-0"]/div/text()')
        detail['酒店'] = hr.xpath('//a[@class="itinerary_hotel_item js_Expose_Point js_mapPointHook"]/text()')
        return detail

    def parse_data(self, data):
        """Extract one record per product card from a listing page.

        Args:
            data: HTML source of a listing page.

        Returns:
            A list of dicts. Each dict holds the listing-level fields and,
            when a product id was found on the card, the fields scraped
            from that product's detail page.
        """
        html = etree.HTML(data)
        data_list = []
        for node in html.xpath('//div[@class="list_product_item_border"]'):
            temp = {}
            temp['标题'] = node.xpath('./div/div[2]/p/span/text()')
            id_list = node.xpath('./parent::div[@class="list_product_box js_product_item"]/@data-track-product-id')
            if id_list:
                # When several ids match, the last one wins.
                temp['详情页链接'] = 'https://vacations.ctrip.com/travel/detail/p' + id_list[-1] + '/?city=475'
            price = node.xpath(
                './div/div[2]/div/div[1]/div[2]/span/strong/text()|./div/div[2]/div/div[1]/div/span/strong/text()')
            if price:
                # When several prices match, the last one wins.
                temp['价格'] = '¥' + price[-1]
            temp['描述信息'] = node.xpath('./div/div[2]/div/div[2]/div[2]/div/p/text()')
            temp['供应商'] = node.xpath('./div/div[2]/div/div[2]/p/text()')
            # A card without a product id has no detail link; keep its
            # listing fields rather than failing on the missing key.
            detail_url = temp.get('详情页链接')
            if detail_url is not None:
                # self.url is deliberately left untouched: it tracks the
                # listing page, not the detail page being visited.
                temp.update(self._parse_detail(detail_url))
            data_list.append(temp)
        return data_list

    def next_page(self, data):
        """Point ``self.url`` at the next listing page.

        Args:
            data: HTML source of the listing page that was just crawled.

        Returns:
            The next page number as a string, or ``None`` when no
            pagination link is found or the link points back at the page
            that was just crawled.
        """
        html = etree.HTML(data)
        pages = []
        if html is not None:
            pages = html.xpath('//*[@id="root"]/div/div[1]/div[8]/div[31]/a[last()]/@data-page')
        if not pages:
            return None
        page_no = pages[0]
        next_url = self.LIST_URL + '&p=' + page_no
        if next_url == self.url:
            # Following a link to the current page would loop forever.
            return None
        self.url = next_url
        print(self.url)
        return page_no

    def save_data(self, data_list):
        """Append every record to ``OUTPUT_FILE``, one JSON object per line.

        Each line is terminated by ``,\\n``. Nothing is written, and the
        file is not created, when *data_list* is empty.
        """
        if not data_list:
            return
        lines = [json.dumps(item, ensure_ascii=False) + ',\n' for item in data_list]
        # Binary append mode keeps the bytes identical on every platform.
        with open(self.OUTPUT_FILE, 'ab') as f:
            f.write(''.join(lines).encode())

    def run(self):
        """Crawl listing pages until one is empty or pagination ends.

        The browser session is always shut down afterwards, including
        when an error interrupts the crawl.
        """
        try:
            while True:
                data = self.get_data()
                data_list = self.parse_data(data)
                # parse_data always returns a list, so an empty page has to
                # be detected by emptiness rather than by comparing to None.
                if not data_list:
                    print('空值')
                    break
                self.save_data(data_list)
                if self.next_page(data) is None:
                    break
        finally:
            self.driver.quit()
if __name__ == '__main__':
    # Script entry point: build the scraper and start the crawl.
    Xiecheng().run()