您当前的位置:首页 > 计算机 > 编程开发 > Python

selenium模拟点击京东商城搜索页产品

时间:12-06来源:作者:点击数:

selenium模拟点击京东商城搜索页产品

# -*- coding: utf-8 -*-
#!/usr/bin/env python

import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class JD(object):
    """Selenium-driven Chrome session that searches jd.com and prints results.

    Constructing an instance opens Chrome, loads the JD home page and submits
    a search.  :meth:`product` then clicks the sales sort link and walks
    through the result pages, printing a few products from each page.

    Attributes:
        driver: The ``webdriver.Chrome`` instance controlling the browser.
        page_count_list: One ``{'第N页': count}`` dict per processed page.
        page: 1-based number of the results page currently being processed.
    """

    # Second link of the sort bar; the original author labels it "sort by sales".
    _SALES_SORT_XPATH = '//div[@id="J_filter"]/div/div[@class="f-sort"]/a[2]'
    # Last <li> currently present in the product list.
    _LAST_PRODUCT_XPATH = '//*[@id="J_goodsList"]/ul/li[last()]'
    # Script that scrolls the element passed as arguments[0] into view.
    _SCROLL_INTO_VIEW_JS = "return arguments[0].scrollIntoView();"

    def __init__(self, keyword='iphone', extension_path='D:/VM/xpath_2.0.2_0.crx'):
        """Open Chrome, load jd.com and submit a search for ``keyword``.

        Args:
            keyword: Text typed into the search box.  Defaults to the value
                that used to be hard-coded.
            extension_path: Path of a ``.crx`` Chrome extension to install in
                the browser.  When the file does not exist the extension is
                skipped with a notice, because ``add_extension`` raises for a
                missing path.
        """
        import os  # local import: only needed for the extension-file check

        option = ChromeOptions()
        if extension_path and os.path.isfile(extension_path):
            option.add_extension(extension_path)
        else:
            print("未找到扩展文件,跳过加载: %s" % extension_path)
        # NOTE: the original author reports that '--headless' made the browser
        # unresponsive, so the browser runs with a visible window.
        # Suppress the "Chrome is being controlled by automated software" bar.
        option.add_argument('--disable-infobars')
        # Anti-bot-detection tweak: per the original author's note, with this
        # switch excluded `window.navigator.webdriver` evaluates to undefined
        # in the DevTools console instead of true.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = webdriver.Chrome(options=option)
        self.driver.maximize_window()

        url = 'http://www.jd.com'
        self.driver.get(url)
        time.sleep(3)

        # Type the keyword into the search box (element id "key"), pause,
        # then submit by sending the Return key.
        kw = self.driver.find_element(By.ID, 'key')
        kw.send_keys(keyword)
        time.sleep(1)
        kw.send_keys(Keys.RETURN)
        time.sleep(3)

        # Product count per page, e.g. [{'第1页': 60}, {'第2页': 60}]
        self.page_count_list = []
        # Current results page number
        self.page = 1
        print(self.driver.title)

    def product(self, max_per_page=5):
        """Sort the results by sales and page through them, printing products.

        For every results page this waits for the lazily loaded items,
        locates the "next page" link, prints up to ``max_per_page`` products
        and records the number of products found in ``self.page_count_list``.
        The loop stops once the page source contains ``'pn-next disabled'``.

        Args:
            max_per_page: Maximum number of products printed per page
                (``None`` prints all of them).  Defaults to 5.
        """
        time.sleep(1)
        self._sort_by_sales()
        time.sleep(4)
        # Total number of result pages, read as text from the pager.
        all_page = self.driver.find_element(By.CSS_SELECTOR, 'span.p-skip>em>b').text
        while True:
            time.sleep(2)
            self._wait_for_lazy_load()
            next_page = self._find_next_page_button()
            time.sleep(2)
            # Scroll until the "next page" link is visible.
            self.driver.execute_script(self._SCROLL_INTO_VIEW_JS, next_page)
            time.sleep(2)
            # Explicit wait (up to 10 s) for the product items; returns a list.
            products = WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gl-item')))
            length = len(products)
            # Save the page for inspection when a non-final page yields exactly
            # 30 items (presumably only half of the page loaded - TODO confirm).
            if length == 30 and self.page < int(all_page):
                with open(str(self.page) + '页30产品.html', 'w', encoding='utf-8') as f:
                    f.write(self.driver.page_source)
            print("获取所有产品%s个,共%s页" % (length, all_page))
            # Number of products found on this page, e.g. {'第1页': 60}
            page_count = {'第%s页' % self.page: length}
            for num, product in enumerate(products[:max_per_page], start=1):
                time.sleep(1)
                row = self._parse_product(product)
                print("第%s个产品,第%s页,获取数量:%s" % (num, self.page, page_count))
                print(row)
            self.page_count_list.append(page_count)
            time.sleep(1)
            # A disabled "next" link in the page source means this was the last page.
            if 'pn-next disabled' in self.driver.page_source:
                break
            time.sleep(2)
            next_page.click()
            print("----翻页操作----")
            print(self.page_count_list)
            self.page += 1
        print("处理完毕,获取情况:", self.page_count_list)

    def _sort_by_sales(self):
        """Click the sales sort link, retrying with a direct lookup on failure."""
        try:
            print("WebDriverWait点击销量")
            sales = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, self._SALES_SORT_XPATH)))
            sales.click()
        except Exception:
            # Best-effort fallback kept from the original: any failure of the
            # waited lookup or click triggers one immediate retry.
            print("by_xpath点击销量")
            self.driver.find_element(By.XPATH, self._SALES_SORT_XPATH).click()

    def _wait_for_lazy_load(self):
        """Scroll to the last product until the loading placeholder lookup fails.

        While an element with id ``J_scroll_loading`` can be found, the last
        product is scrolled into view.  After more than seven rounds the page
        is refreshed and the counter restarts.
        """
        load_count = 1
        while True:
            try:
                self.driver.find_element(By.ID, 'J_scroll_loading')
                print("产品正在加载...")
                load_count += 1
                time.sleep(2)
                if load_count > 7:
                    print("产品加载次数过多,尝试刷新页面")
                    self.driver.refresh()
                    time.sleep(3)
                    load_count = 1
                try:
                    last_product = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, self._LAST_PRODUCT_XPATH)))
                except Exception:
                    last_product = self.driver.find_element(By.XPATH, self._LAST_PRODUCT_XPATH)
                self.driver.execute_script(self._SCROLL_INTO_VIEW_JS, last_product)
                time.sleep(3)
            except Exception:
                # Any error in the round (e.g. the placeholder lookup failing)
                # ends the wait, as in the original implementation.
                break

    def _find_next_page_button(self):
        """Return the ``a.pn-next`` link, dumping the page source if the wait fails."""
        try:
            print("WebDriverWait准备获取下一页按钮")
            return WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a.pn-next')))
        except Exception:
            # Name the dump after the page being processed (self.page), not
            # the total page count, so dumps from different pages do not
            # overwrite each other.
            with open("第%s页报错.html" % self.page, 'w', encoding='utf-8') as f:
                f.write(self.driver.page_source)
            print("css_selector准备获取下一页按钮")
            return self.driver.find_element(By.CSS_SELECTOR, 'a.pn-next')

    def _parse_product(self, product):
        """Build a dict of title, price, comments, shop and links for one product item.

        Args:
            product: The product ``WebElement`` (an item with class ``gl-item``).

        Returns:
            A dict with keys ``title``, ``price``, ``comment``, ``shop``,
            ``shop_url`` and ``product_url``; ``ticket`` is present only when
            a coupon icon is found.
        """
        row = {}
        sku = product.get_attribute('data-sku')
        # Title
        row['title'] = product.find_element(By.CSS_SELECTOR, 'div.p-name>a>em:not(.p-tag)').text
        # Price
        row['price'] = product.find_element(By.CSS_SELECTOR, 'strong.J_%s>i' % sku).text
        # Number of comments
        row['comment'] = product.find_element(By.ID, 'J_comment_%s' % sku).text + '条评价'
        try:
            # Shop name and shop link come from the same anchor element.
            shop_link = product.find_element(By.CSS_SELECTOR, 'span.J_im_icon>a')
            row['shop'] = shop_link.text
            row['shop_url'] = shop_link.get_attribute('href')
        except Exception:
            # Placeholders used when the shop link cannot be read.
            row['shop'] = '无店铺'
            row['shop_url'] = '无链接'
        # Product link
        row['product_url'] = product.find_element(By.CSS_SELECTOR, 'div.p-img>a').get_attribute('href')
        # Coupon label: taken from the icon whose tooltip marks a claimable coupon.
        for icon in product.find_elements(By.CSS_SELECTOR, 'div.p-icons>i'):
            if icon.get_attribute('data-tips') == '本商品可领用优惠券':
                row['ticket'] = icon.text
        return row
if __name__ == '__main__':
    # Script entry point: open the browser session (the constructor performs
    # the search), then crawl the result pages.
    crawler = JD()
    crawler.product()
为方便获取更多学习、工作、生活信息,请关注本站微信公众号:城东书院(微信服务号)、城东书院(微信订阅号)。
推荐内容
相关内容
栏目更新
栏目热门
本栏推荐