
Scraping the Shaanxi Provincial Government Procurement website with Scrapy and storing the data in MongoDB, Redis, MySQL and a local spreadsheet (optional middlewares: random User-Agent request headers and Selenium-driven browsing)


1. Create the Scrapy project

scrapy startproject caigou

2. Enter the project directory and create a spider with the genspider command

scrapy genspider ccgp ccgp-shaanxi.gov.cn
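
These two commands typically leave a project layout like the one below (the exact files vary slightly by Scrapy version); the rest of this walkthrough edits items.py, the spider files under spiders/, pipelines.py, settings.py and middlewares.py:

caigou/
    scrapy.cfg
    caigou/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ccgp.py        # generated by genspider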

3. Define the data to scrape (items.py)

import scrapy

class CaigouItem(scrapy.Item):
    # announcement id
    notice_id = scrapy.Field()
    # province / region name
    province = scrapy.Field()
    # district name
    area = scrapy.Field()
    # notice type
    notice_type_title = scrapy.Field()
    # notice title
    notice_title = scrapy.Field()
    # notice publication date
    notice_date = scrapy.Field()
    # notice url
    notice_url = scrapy.Field()
    # publishing organisation
    pub_company = scrapy.Field()
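
For reference, a scrapy.Item behaves much like a dict, which is what the pipelines below rely on (dict(item), item.items()); a quick standalone sketch of how the spider will fill this item:

from caigou.items import CaigouItem

item = CaigouItem()
item['province'] = '陕西省本级'          # values are filled in by the spider
item['notice_type_title'] = '采购公告'
print(dict(item))                        # Items convert cleanly into plain dicts
# assigning a field that was not declared in CaigouItem raises a KeyError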

4. Write the spider that extracts the item data (spiders/ccgp.py)

# -*- coding: utf-8 -*-
# The tricky part of this case: the data we need is loaded by an AJAX request, and the correct
# response only comes back when the right form data is posted.
# The AJAX URL is identical for every region of a given notice type; only the form parameters
# differ, so duplicate filtering has to be disabled with dont_filter=True.
# When using this spider, remove the SeleniumMiddleWare entry from settings.
# Mind the XPath rule for the page count: from page five onwards the total is shown as "...",
# so the number has to be taken from the "last page" button instead.
import re
import scrapy
from ..items import CaigouItem

class CcgpSpider(scrapy.Spider):
    name = 'ccgp'
    allowed_domains = ['ccgp-shaanxi.gov.cn']
    start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']

    def parse(self, response):
        items = []
        # notice type codes, e.g. totype('3')
        notice_type_nums = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/@onclick').extract()
        # notice type names
        notice_type_titles = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/a/text()').extract()
        patt_notice = r"\d+"
        # province names and their codes, e.g. regionCity('610001','陕西省本级')
        regionCitys = response.xpath('//ul[@id="toRegion"]/li/@onclick').extract()
        patt = r"regionCity\('(\d+)','(.+)'\)"
        # URL the form data is posted to
        form_url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype="
        length = len(notice_type_titles)
        # for testing only: limit the loop over notice types
        for i in range(6,8):
        # for i in range(length):
            notice_num = re.findall(patt_notice,notice_type_nums[i])
            notice_num = notice_num[0]
            notice_type_title = notice_type_titles[i]
            # for testing only: limit the loop over provinces
            for regionCity in regionCitys[4:6]:
            # for regionCity in regionCitys:
                notice_dict = {}
                result = re.findall(patt, regionCity)
                # province name
                notice_dict['province'] = result[0][1]
                # province code
                notice_dict['province_code'] = result[0][0]
                # notice type
                notice_dict['notice_type_title'] = notice_type_title
                # URL the form will be submitted to
                notice_dict['form_url'] = form_url + notice_num
                items.append(notice_dict)
        # items ends up like: [{"province":"陕西省本级","province_code":"610001","notice_type_title":"采购公告",
        # "form_url":"http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"},
        # {"province":"陕西省本级","province_code":"610001","notice_type_title":"结果公告",
        # "form_url":"http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=5"}]
        for item in items:
            # only after posting the form does the page return the data we need and the page count
            yield scrapy.FormRequest(url=item['form_url'],meta={"meta_1":item},
                                     formdata={"page.pageNum": "1", "parameters['regionguid']": item['province_code']},
                                     callback=self.parse_pages)

    # get the number of pages
    def parse_pages(self,response):
        meta_1 = response.meta['meta_1']
        province = meta_1['province']
        notice_type_title = meta_1['notice_type_title']
        item= {}
        item['province'] = province
        item['notice_type_title'] = notice_type_title

        # total page count, taken from e.g. javascript:toPage('',1525);
        re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
        total_pages = re.findall(r"\d+", re_total_pages)
        if not total_pages:
            total_pages = "1"
        else:
            total_pages = total_pages[0]

        # for testing only: cap the page count at 3
        if total_pages != '1':
            total_pages = '3'

        # print("[%s][%s] total pages: %s" % (province, notice_type_title, total_pages))
        for page in range(1,int(total_pages)+1):
            yield scrapy.FormRequest(url=meta_1['form_url'],meta={"meta_2":item},
                                     formdata={"page.pageNum": str(page), "parameters['regionguid']": meta_1['province_code']},
                                     callback=self.parse_detail, dont_filter=True)

    def parse_detail(self,response):
        meta_2 = response.meta['meta_2']
        # district (area) values, as a list
        areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
        # notice titles
        notice_titles = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
        # notice urls
        notice_urls = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
        length = len(notice_titles)
        province = meta_2['province']
        notice_type_title = meta_2['notice_type_title']
        if length > 0:
            for i in range(length):
                item = {}
                item['notice_id'] = notice_urls[i].split("=")[-1]
                # province / region
                item['province'] = province
                # notice type
                item['notice_type_title'] = notice_type_title
                # notice title
                item['notice_title'] = notice_titles[i]
                # notice url
                item['notice_url'] = notice_urls[i]
                # district (area)
                item['area'] = areas[i]
                yield scrapy.Request(url=item['notice_url'],meta={"meta_3":item},callback=self.parse_info)
        else:
            item = CaigouItem()
            no_data = "无数据"
            item['notice_id'] = province + notice_type_title + no_data
            # province / region
            item['province'] = province
            # notice type
            item['notice_type_title'] = notice_type_title
            # notice title
            item['notice_title'] = no_data
            # notice url
            item['notice_url'] = no_data
            # district (area)
            item['area'] = no_data
            # publishing organisation
            item['pub_company'] = no_data
            # publication date
            item['notice_date'] = no_data
            yield item

    def parse_info(self,response):
        meta_3 = response.meta['meta_3']
        item = CaigouItem()
        # id
        item['notice_id'] = meta_3['notice_id']
        # province / region
        item['province'] = meta_3['province']
        # district (area)
        item['area'] = meta_3['area']
        # notice type
        item['notice_type_title'] = meta_3['notice_type_title']
        # publishing organisation, e.g. 发布单位:陕西炬荣招标代理有限公司
        pub_company = response.xpath('//div[@class="content_about"]/span[1]/text()').get()
        item['pub_company'] = pub_company.strip().split(":")[-1]
        # notice title
        item['notice_title'] = meta_3['notice_title']
        # notice publication date
        item['notice_date'] = response.xpath('//div[@class="content_about"]/span[2]/em/text()').get()
        # notice url
        item['notice_url'] = meta_3['notice_url']

        yield item
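
To inspect the AJAX endpoint outside of Scrapy, a roughly equivalent request can be sent with the requests library. This is only a debugging sketch: the form field names and the example region code are taken from the spider above, and the site may of course have changed since this was written.

import requests

# noticetype=3 corresponds to 采购公告; regionguid 610001 is 陕西省本级
url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"
form = {
    "page.pageNum": "1",                   # page number, same field the spider posts
    "parameters['regionguid']": "610001",  # region code scraped from the start page
}
resp = requests.post(url, data=form, timeout=10)
print(resp.status_code, len(resp.text))    # the returned HTML fragment holds the notice table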

5. Optional: an alternative spider (spiders/ccgp2.py) that adds the SeleniumMiddleWare middleware and uses Selenium to simulate clicking through the pages of the first province and the first notice type

# -*- coding: utf-8 -*-
# Building on ccgp.py, this spider adds the SeleniumMiddleWare middleware and uses Selenium to
# simulate clicking through the pages of the first province and the first notice type.
import re
import time
import scrapy
from scrapy.utils.project import get_project_settings

class CcgpSpider(scrapy.Spider):
    name = 'ccgp2'
    allowed_domains = ['ccgp-shaanxi.gov.cn']
    start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']
    next_first_datas = []

    now_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
    # clear the log file first; rewriting it this way has the same effect as deleting the file
    with open(get_project_settings().get('LOG_FILE'), 'w+',encoding='utf-8')as f:
        f.write('清空日志完成,当前时间为:%s\n'%(now_time))

    def parse(self, response):
        # total page count, taken from e.g. javascript:toPage('',1525);
        re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
        total_pages = re.findall(r"\d+", re_total_pages)
        if not total_pages:
            total_pages = "无数据"
        else:
            total_pages = total_pages[0]
        if total_pages != "无数据":

            # for testing: only fetch the last 3 pages; remove this for a full run
            page = int(total_pages) - 2

            notice_nums = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[1]/text()').extract()
            notice_type_title_num = 1
            regionCity_num = 1
            # stop when the first entry of the new page equals the first entry of the previous page;
            # to keep the list from growing, it is cleared whenever a new value shows up.
            # the url id of the first entry may repeat across notices, so the row sequence number is used instead
            notice_num_first = notice_nums[0]
            if notice_num_first not in self.next_first_datas:
                self.next_first_datas.clear()
                self.next_first_datas.append(notice_num_first)
                yield scrapy.Request(url=response.request.url, meta={"next_page": True,
                                    "notice_type_title_num":notice_type_title_num,
                                   "regionCity_num":regionCity_num,"page":page}, dont_filter=True)

                # district (area) values, as a list
                areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
                # notice titles
                notice_titles = response.xpath(
                    '//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
                # notice urls
                notice_urls = response.xpath(
                    '//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
                length = len(notice_titles)
                for i in range(length):
                    item = {}
                    item['notice_id'] = notice_urls[i].split("=")[-1]
                    item['area'] = areas[i].strip()
                    item['notice_title'] = notice_titles[i].strip()
                    item['notice_url'] = notice_urls[i]
                    yield item

            else:
                # current page number
                current_page = response.xpath('//ul[@class="pagination"]/li[@class="active"]/a/text()').get()
                print("已经获取到最后一页:%s" % current_page)
        else:
            no_data = "无数据"
            item = {}
            item['area'] = no_data
            item['notice_title'] = no_data
            item['notice_url'] = no_data
            yield item
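
Because every page of a given region/notice-type combination is served from the same URL, ccgp2 cannot use Scrapy's dupefilter to know when to stop; instead it remembers the first row number of the page it just parsed and stops as soon as the same value comes back (clicking "next" no longer changes the table). A stripped-down sketch of that stop condition, outside of Scrapy:

def is_new_page(first_row_number, seen):
    """Mimics the next_first_datas check in ccgp2: True means keep paging."""
    if first_row_number in seen:
        return False          # same first row as last time -> we are on the final page
    seen.clear()              # keep the list tiny: only the latest value matters
    seen.append(first_row_number)
    return True

seen = []
print(is_new_page("1", seen))    # True  (first page)
print(is_new_page("21", seen))   # True  (a new page, different first row number)
print(is_new_page("21", seen))   # False (clicking next returned the same page)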

6. Save the data in the item pipelines (pipelines.py); results can be written to a local spreadsheet, MySQL, MongoDB and Redis

# -*- coding: utf-8 -*-

import time
from openpyxl import Workbook
import pymysql
import pymongo
import redis
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
from scrapy import Item

class XlsxPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = "采购网数据表"
        self.ws.append(['notice_id','区域','地区','公告类型',
                        '发布单位','公告标题',
                       '公告发布时间','公告url',])
    def process_item(self, item, spider):
        text = [item['notice_id'],item['province'],item['area'],item['notice_type_title'],
                item['pub_company'],item['notice_title'],
                item['notice_date'],item['notice_url'],]
        self.ws.append(text)
        return item

    def close_spider(self,spider):
        file_end_name = time.strftime("%Y-%m-%d",time.localtime())
        self.wb.save(spider.name + file_end_name + ".xlsx")
        print("表格处理完毕!")

class MysqlPipeline():
    @classmethod
    def from_crawler(cls,crawler):
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def open_spider(self,spider):
        self.db = pymysql.connect(host=self.MYSQL_HOST,port=self.MYSQL_PORT,user=self.MYSQL_USER,passwd=self.MYSQL_PASSWD,
                        db=self.MYSQL_DBNAME,charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()
        # count how many rows were inserted
        self.num = 0

    def process_item(self,item,spider):
        try:
            sql = """CREATE TABLE IF NOT EXISTS caigou(notice_id VARCHAR(40)PRIMARY KEY NOT NULL ,
            province VARCHAR(10),area VARCHAR(17),notice_type_title VARCHAR(8),pub_company VARCHAR(60),
            notice_title VARCHAR(150),notice_date VARCHAR(20),notice_url VARCHAR(150))ENGINE=Innodb DEFAULT CHARSET=utf8mb4;"""
            self.cursor.execute(sql)
        except:
            pass
        try:
            self.cursor.execute("select notice_id from caigou WHERE notice_id=%s",item['notice_id'])
            switch = self.cursor.fetchone()
            keys,values = zip(*item.items())
            if switch:
                sql = "insert into caigou({})VALUES ({})on duplicate key update {};".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(["{}=%s".format(k)for k in keys])
                )
                self.cursor.execute(sql,values * 2)
            else:
                self.cursor.execute("insert into caigou({})VALUES ({});".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values))
                ),values)
                self.num += 1
            self.db.commit()
        except Exception as e:
            print("MYSQL出错:",e)
            self.db.rollback()
        return item

    def close_spider(self,spider):
        print("MYSQL处理完毕,本次共计增加%s条数据!" % self.num)
        self.cursor.close()
        self.db.close()

class MongoPipeline():
    MONGO_HOST = get_project_settings().get('MONGO_HOST')
    MONGO_PORT = get_project_settings().get('MONGO_PORT')
    MONGO_DB = get_project_settings().get('MONGO_DB')

    def open_spider(self,spider):
        self.cli = pymongo.MongoClient(self.MONGO_HOST,self.MONGO_PORT)
        # count how many documents were inserted
        self.num = 0

    def process_item(self, item, spider):
        try:
            self.db = self.cli[self.MONGO_DB]
            ccgp = self.db[spider.name]
            data = dict(item) if isinstance(item, Item) else item
            notice_title = item['notice_title']
            notice_id = item['notice_id']
            # cursor.count() and collection.insert() were removed in newer pymongo releases,
            # so count_documents() and insert_one() are used instead
            count = ccgp.count_documents({'notice_id': notice_id})
            if count == 0:
                print("%s添加mongo数据库中..." % notice_title)
                ccgp.insert_one(data)
                self.num += 1
            else:
                print("%s:该数据已存在无需添加!" % notice_title)
        except Exception as e:
            print("mongodb出错:",e)
        return item

    def close_spider(self, spider):
        print("MongoDB数据库处理完毕,共计增加%s条数据!" % self.num)
        self.cli.close()


class RedisPipeline():
    @classmethod
    def from_crawler(cls,crawler):
        cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
        cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
        cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
        cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
        return cls()
    def open_spider(self,spider):
        try:
            # use keyword arguments: the fourth positional argument of StrictRedis is the
            # password, not decode_responses
            self.redis_client = redis.StrictRedis(host=self.REDIS_HOST, port=self.REDIS_PORT,
                                                  db=self.REDIS_DBNAME,
                                                  decode_responses=self.REDIS_decode_responses)
        except Exception as e:
            print("redis数据库出错:",e)

    def process_item(self, item, spider):
        if self.redis_client.sadd("ccgp:items",item['notice_id']):
            return item
        raise DropItem

    def close_spider(self, spider):
        print("redis处理完毕!")

7. Configure the settings file (settings.py)

# Optionally write the log to a local file
LOG_FILE = "ccgp.log"
LOG_LEVEL = "DEBUG"
# also redirect print() output into the log file
LOG_STDOUT = True


MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_DBNAME = "python5"
MYSQL_CHARSET = "utf8mb4"

MONGO_HOST = "localhost"
MONGO_PORT = 27017
MONGO_DB = "py4"

REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_DBNAME = 4
REDIS_decode_responses = True  # return stored values as str instead of bytes


# pool of User-Agent strings for the random request-header middleware
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"
]

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
# note: enable SeleniumMiddleWare only when running the ccgp2 spider
DOWNLOADER_MIDDLEWARES = {
   'caigou.middlewares.RandomUserAgentMiddleWare': 543,
   # 'caigou.middlewares.SeleniumMiddleWare': 544,
   # 'caigou.middlewares.CaigouDownloaderMiddleware': 543,
}
# enable the pipelines selectively
ITEM_PIPELINES = {
   'caigou.pipelines.XlsxPipeline': 300,
   # 'caigou.pipelines.MysqlPipeline': 301,
   # 'caigou.pipelines.MongoPipeline': 302,
   # 'caigou.pipelines.RedisPipeline': 303,
}
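
A quick way to check that Scrapy is actually picking these values up is the settings command (run inside the project directory); any setting can also be overridden for a single run with -s, which is handy for turning the DEBUG log down without editing the file:

scrapy settings --get=DOWNLOAD_DELAY
scrapy settings --get=LOG_FILE
scrapy crawl ccgp -s LOG_LEVEL=INFO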

8. Optional: add the random User-Agent middleware and the Selenium middleware (middlewares.py)

import time
import random
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver

class RandomUserAgentMiddleWare():
    @classmethod
    def from_crawler(cls, crawler):
        cls.USER_AGENT_LIST = crawler.settings.get('USER_AGENT_LIST')
        return cls()

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENT_LIST)
        print("当前选用的请求头:%s" % user_agent)
        request.headers['User-Agent'] = user_agent

class SeleniumMiddleWare():

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        # crawler.signals.connect(s.engine_started, signal=signals.engine_started)
        # cls.LOG_FILE = crawler.settings.get('LOG_FILE')
        return s

    # clearing the log file here does work, but it also wipes the log lines written while the
    # crawler starts up, so the clearing is done in the spider file instead
    # def engine_started(self):
    #     with open(self.LOG_FILE,'w+')as f:
    #         f.write('')
    #     print("清空日志文件内容!")

    def spider_opened(self, spider):
        self.chrome = webdriver.Chrome()
        self.chrome.maximize_window()

    def process_request(self, request, spider):
        # check whether this is the first request; the first one carries no next_page meta key
        # request.meta on the first request looks like: {'download_timeout': 180.0}
        if not request.meta.get("next_page",False):
            self.chrome.get(request.url)
        # request.meta on later requests looks like: {'next_page': True, 'depth': 1, 'download_timeout': 180.0}
        else:
            if request.meta['depth'] == 1 or request.meta.get("current_page",False):
                notice_type_title_num = request.meta['notice_type_title_num']
                regionCity_num = request.meta['regionCity_num']
                time.sleep(2)
                print("点击区域第%s个" % notice_type_title_num)
                self.chrome.find_element_by_xpath(
                    '//ul[@id="toRegion"]/li['+str(notice_type_title_num)+']').click()
                time.sleep(1)
                print("点击类型第%s个" % notice_type_title_num)
                self.chrome.find_element_by_xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li['+str(
                    regionCity_num)+']').click()

                # for testing: there are too many pages, so jump to a late page to see how the last page is handled
                time.sleep(2)
                self.chrome.find_element_by_id('infoNoticeInputPage').clear()
                time.sleep(1)
                self.chrome.find_element_by_id('infoNoticeInputPage').send_keys(request.meta['page'])
                time.sleep(1)
                self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()]/button').click()
                time.sleep(1)

            else:
                # click the next-page link
                self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()-3]/a').click()
        time.sleep(2)
        html = self.chrome.page_source
        return HtmlResponse(url=request.url,body=html.encode('utf-8'))

    def spider_closed(self, spider):
        time.sleep(3)
        print("准备关闭浏览器...")
        self.chrome.quit()
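
One caveat: the find_element_by_xpath / find_element_by_id helpers used above come from Selenium 3; they are deprecated in Selenium 4 and removed in its later releases. If you run Selenium 4, the equivalent calls use By locators, for example:

from selenium import webdriver
from selenium.webdriver.common.by import By

chrome = webdriver.Chrome()
chrome.get("http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province")
# Selenium 4 style: find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
chrome.find_element(By.XPATH, '//ul[@id="toRegion"]/li[1]').click()
chrome.find_element(By.ID, 'infoNoticeInputPage').clear()
chrome.quit()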

9. Remember to start the MySQL, Redis and MongoDB services beforehand and create the corresponding MySQL table:

CREATE TABLE IF NOT EXISTS caigou(notice_id VARCHAR(40) PRIMARY KEY NOT NULL,
province VARCHAR(10), area VARCHAR(17), notice_type_title VARCHAR(8),
pub_company VARCHAR(60), notice_title VARCHAR(150),
notice_date VARCHAR(20), notice_url VARCHAR(150)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
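
The MongoPipeline above deduplicates by querying notice_id before each insert; if you also want MongoDB itself to enforce uniqueness (mirroring the MySQL primary key), a unique index can be created once. A minimal sketch, assuming the py4 database from settings.py and the ccgp collection named after the spider:

import pymongo

cli = pymongo.MongoClient("localhost", 27017)
db = cli["py4"]                                       # MONGO_DB from settings.py
db["ccgp"].create_index("notice_id", unique=True)     # collection is named after the spider
cli.close()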

10. With everything above in place, start crawling: run the crawl command to launch a spider:

scrapy crawl ccgp
or
scrapy crawl ccgp2
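
After a run, XlsxPipeline saves the spreadsheet as the spider name followed by the current date (YYYY-MM-DD) with an .xlsx extension, per time.strftime in close_spider, and the log goes to ccgp.log as configured in settings.py. For a quick dump without any of the pipelines, Scrapy's built-in feed export also works:

scrapy crawl ccgp -o ccgp.json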

 
