# Selenium demo: simulate clicking through products on the JD.com (京东商城) search results page
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class JD(object):
    """Drive Chrome through a keyword search on JD.com and print product data.

    Constructing an instance launches Chrome, opens the start page and submits
    the search. :meth:`product` then sorts the results by sales, walks the
    result pages and prints a dict of fields for the first products of each
    page.
    """

    # Locators and scripts shared by several steps of the crawl.
    _SORT_BY_SALES_XPATH = '//div[@id="J_filter"]/div/div[@class="f-sort"]/a[2]'
    _LAST_PRODUCT_XPATH = '//*[@id="J_goodsList"]/ul/li[last()]'
    _NEXT_PAGE_CSS = 'a.pn-next'
    _SCROLL_INTO_VIEW_JS = "return arguments[0].scrollIntoView();"

    def __init__(self, keyword='iphone',
                 extension_path='D:/VM/xpath_2.0.2_0.crx',
                 url='http://www.jd.com'):
        """Start Chrome, open ``url`` and search for ``keyword``.

        Args:
            keyword: Text typed into the search box.
            extension_path: Path of a ``.crx`` extension (the XPath helper by
                default) to install in the browser; pass ``None`` or ``''``
                to start Chrome without it.
            url: Page that hosts the search box (element id ``key``).

        Raises:
            selenium.common.exceptions.WebDriverException: if the browser
                cannot be started or the search box cannot be used. The
                browser is closed before the error propagates.
        """
        option = ChromeOptions()
        if extension_path:
            # Add the XPath helper extension to the browser.
            option.add_extension(extension_path)
        # NOTE: headless mode ('--headless') is deliberately left off; the
        # original author found the browser became unresponsive with it.
        # Hide the "Chrome is being controlled by automated software" banner.
        option.add_argument('--disable-infobars')
        # Anti-bot-detection tweak: per the original author's note, with this
        # switch excluded `window.navigator.webdriver` reads as undefined
        # (rather than true) in the DevTools console.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])

        # Product count collected per page, e.g. [{'第1页': 60}, {'第2页': 60}].
        self.page_count_list = []
        # 1-based number of the result page currently being processed.
        self.page = 1

        self.driver = webdriver.Chrome(options=option)
        try:
            self._open_and_search(url, keyword)
        except Exception:
            # The constructor is failing, so the caller never receives an
            # instance it could close: shut the browser down here instead of
            # leaking it, then let the original error propagate.
            self.driver.quit()
            raise
        print(self.driver.title)

    def _open_and_search(self, url, keyword):
        """Maximize the window, load ``url`` and submit ``keyword`` to the search box."""
        self.driver.maximize_window()
        self.driver.get(url)
        time.sleep(3)
        # Type the keyword into the search box ...
        kw = self.driver.find_element(By.ID, 'key')
        kw.send_keys(keyword)
        time.sleep(1)
        # ... and submit it by simulating the Return key.
        kw.send_keys(Keys.RETURN)
        time.sleep(3)

    def close(self):
        """Quit the browser. Never called automatically; the window stays open otherwise."""
        self.driver.quit()

    @staticmethod
    def _parse_page_total(text):
        """Return the page total shown in the pager as an int, or None if it is not numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _is_last_page(page_source):
        """Return True when the page markup contains a disabled "next page" button."""
        return 'pn-next disabled' in page_source

    def _sort_by_sales(self):
        """Click the sort-by-sales link, retrying once with a direct lookup on failure."""
        locator = (By.XPATH, self._SORT_BY_SALES_XPATH)
        try:
            print("WebDriverWait点击销量")
            sales = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(locator))
            sales.click()
        except WebDriverException:
            print("by_xpath点击销量")
            self.driver.find_element(*locator).click()

    def _wait_for_lazy_load(self):
        """Keep scrolling to the last product while the loading placeholder exists.

        While the element with id ``J_scroll_loading`` is present, scroll the
        last product into view so the page keeps loading. After too many
        rounds the page is refreshed and the counter restarts. Any WebDriver
        error (normally: the placeholder is gone) ends the wait.
        """
        last_locator = (By.XPATH, self._LAST_PRODUCT_XPATH)
        load_count = 1
        while True:
            try:
                self.driver.find_element(By.ID, 'J_scroll_loading')
                print("产品正在加载...")
                load_count += 1
                time.sleep(2)
                if load_count > 7:
                    print("产品加载次数过多,尝试刷新页面")
                    self.driver.refresh()
                    time.sleep(3)
                    load_count = 1
                try:
                    last_product = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located(last_locator))
                except WebDriverException:
                    last_product = self.driver.find_element(*last_locator)
                self.driver.execute_script(self._SCROLL_INTO_VIEW_JS, last_product)
                time.sleep(3)
            except WebDriverException:
                break

    def _find_next_page_button(self):
        """Return the "next page" link, dumping the page source if the wait fails.

        Raises:
            selenium.common.exceptions.WebDriverException: if the link is
                still missing on the direct lookup after the wait failed.
        """
        locator = (By.CSS_SELECTOR, self._NEXT_PAGE_CSS)
        try:
            print("WebDriverWait准备获取下一页按钮")
            return WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(locator))
        except WebDriverException:
            # Name the dump after the page being processed (it used to be
            # named after the total page count, so every dump overwrote the
            # same, mislabelled file).
            with open("第%s页报错.html" % self.page, 'w', encoding='utf-8') as f:
                f.write(self.driver.page_source)
            print("css_selector准备获取下一页按钮")
            return self.driver.find_element(*locator)

    def _parse_product(self, product):
        """Extract the displayed fields of one product ``li`` element into a dict.

        Keys: ``title``, ``price``, ``comment``, ``shop``, ``shop_url``,
        ``product_url`` and, only when a coupon badge is present, ``ticket``.
        """
        row = {}
        sku = product.get_attribute('data-sku')
        # Title
        row['title'] = product.find_element(
            By.CSS_SELECTOR, 'div.p-name>a>em:not(.p-tag)').text
        # Price
        row['price'] = product.find_element(
            By.CSS_SELECTOR, 'strong.J_%s>i' % sku).text
        # Number of reviews
        row['comment'] = product.find_element(
            By.ID, 'J_comment_%s' % sku).text + '条评价'
        try:
            # Shop name and shop link come from the same anchor.
            shop_link = product.find_element(By.CSS_SELECTOR, 'span.J_im_icon>a')
            row['shop'] = shop_link.text
            row['shop_url'] = shop_link.get_attribute('href')
        except WebDriverException:
            # No shop anchor on this item: fall back to placeholders.
            row['shop'] = '无店铺'
            row['shop_url'] = '无链接'
        # Product link
        row['product_url'] = product.find_element(
            By.CSS_SELECTOR, 'div.p-img>a').get_attribute('href')
        # Coupon badge, if the item offers one
        for badge in product.find_elements(By.CSS_SELECTOR, 'div.p-icons>i'):
            if badge.get_attribute('data-tips') == '本商品可领用优惠券':
                row['ticket'] = badge.text
        return row

    def product(self, max_per_page=5):
        """Sort the results by sales, then walk every result page and print products.

        For each page the method waits for lazy loading to finish, records the
        number of products found in ``self.page_count_list`` and prints the
        parsed fields of the first ``max_per_page`` products. It stops when
        the page source shows a disabled "next page" button.

        Args:
            max_per_page: How many products to parse on each page; ``None``
                parses all of them.
        """
        time.sleep(1)
        self._sort_by_sales()
        time.sleep(4)
        # Total number of result pages, as displayed in the pager.
        all_page = self.driver.find_element(By.CSS_SELECTOR, 'span.p-skip>em>b').text
        total_pages = self._parse_page_total(all_page)
        while True:
            time.sleep(2)
            self._wait_for_lazy_load()
            next_page = self._find_next_page_button()
            time.sleep(2)
            # Scroll until the next-page button is visible.
            self.driver.execute_script(self._SCROLL_INTO_VIEW_JS, next_page)
            time.sleep(2)
            # Explicit wait (up to 10 s) for the product items; yields a list.
            products = WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gl-item')))
            length = len(products)
            # Diagnostic dump: exactly 30 items on a non-final page presumably
            # means the lazily loaded second half never arrived.
            if length == 30 and total_pages is not None and self.page < total_pages:
                with open(str(self.page) + '页30产品.html', 'w', encoding='utf-8') as f:
                    f.write(self.driver.page_source)
            print("获取所有产品%s个,共%s页" % (length, all_page))
            # Product count for the current page, e.g. {'第1页': 60}.
            page_count = {'第%s页' % self.page: length}
            for num, item in enumerate(products[:max_per_page], start=1):
                time.sleep(1)
                row = self._parse_product(item)
                print("第%s个产品,第%s页,获取数量:%s" % (num, self.page, page_count))
                print(row)
            self.page_count_list.append(page_count)
            time.sleep(1)
            # A disabled next-page button marks the final page.
            if self._is_last_page(self.driver.page_source):
                break
            time.sleep(2)
            next_page.click()
            print("----翻页操作----")
            print(self.page_count_list)
            self.page += 1
        print("处理完毕,获取情况:", self.page_count_list)
if __name__ == '__main__':
    # Script entry point: open the browser session (the constructor performs
    # the search) and crawl the result pages.
    JD().product()