
Scraping the Shaanxi Provincial Government Procurement website with Scrapy and saving the data to MongoDB, Redis, MySQL, and a local spreadsheet (optional middleware: random User-Agent request headers and Selenium support)


1. Create the Scrapy project

    scrapy startproject caigou

2. Enter the project directory and create a Spider with the genspider command

    scrapy genspider ccgp ccgp-shaanxi.gov.cn
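
After the two commands, the project layout should look roughly like this; the files edited in the following steps all live inside the inner caigou/ package, and ccgp2.py is added by hand in the optional Selenium step:

    caigou/
        scrapy.cfg
        caigou/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                ccgp.py
                ccgp2.py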

3. Define the data to scrape (items.py)

    import scrapy


    class CaigouItem(scrapy.Item):
        # notice id
        notice_id = scrapy.Field()
        # province / region name
        province = scrapy.Field()
        # area (district) name
        area = scrapy.Field()
        # notice type
        notice_type_title = scrapy.Field()
        # notice title
        notice_title = scrapy.Field()
        # notice publication date
        notice_date = scrapy.Field()
        # notice url
        notice_url = scrapy.Field()
        # publishing organization
        pub_company = scrapy.Field()

4. Write the Spider that extracts the item data (in the spiders folder: ccgp.py)

    # -*- coding: utf-8 -*-
    # The tricky part here: the data we need is loaded by AJAX, so the correct
    # results only come back when the right form data is POSTed with the request.
    # For a given notice type the AJAX URL is the same for every region; only the
    # form parameters differ, so duplicate filtering must be disabled with dont_filter=True.
    # If you use this spider, remove the SeleniumMiddleWare entry from settings.
    # Note the XPath rule for the page count: from page five onwards the total is
    # rendered as "...", so read it from the "last page" button instead.
    import re

    import scrapy

    from ..items import CaigouItem


    class CcgpSpider(scrapy.Spider):
        name = 'ccgp'
        allowed_domains = ['ccgp-shaanxi.gov.cn']
        start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']

        def parse(self, response):
            items = []
            # Notice type codes, e.g. totype('3')
            notice_type_nums = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/@onclick').extract()
            # Notice type names
            notice_type_titles = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/a/text()').extract()
            patt_notice = r"\d+"
            # Region names and codes, e.g. regionCity('610001','陕西省本级')
            regionCitys = response.xpath('//ul[@id="toRegion"]/li/@onclick').extract()
            patt = r"regionCity\('(\d+)','(.+)'\)"
            # URL the form is POSTed to
            form_url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype="
            length = len(notice_type_titles)
            # For testing: limit the loop over notice types
            for i in range(6, 8):
            # for i in range(length):
                notice_num = re.findall(patt_notice, notice_type_nums[i])
                notice_num = notice_num[0]
                notice_type_title = notice_type_titles[i]
                # For testing: limit the loop over regions
                for regionCity in regionCitys[4:6]:
                # for regionCity in regionCitys:
                    notice_dict = {}
                    result = re.findall(patt, regionCity)
                    # Region name
                    notice_dict['province'] = result[0][1]
                    # Region code
                    notice_dict['province_code'] = result[0][0]
                    # Notice type
                    notice_dict['notice_type_title'] = notice_type_title
                    # Form submission URL
                    notice_dict['form_url'] = form_url + notice_num
                    items.append(notice_dict)
            # items: [{"province":"陕西省本级","province_code":"610001","notice_type_title":"采购公告",
            #          "form_url":"http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"},
            #         {"province":"陕西省本级","province_code":"610001","notice_type_title":"结果公告",
            #          "form_url":"http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=5"}]
            for item in items:
                # Only after submitting the form does the page show the data and the page count
                yield scrapy.FormRequest(url=item['form_url'], meta={"meta_1": item},
                                         formdata={"page.pageNum": "1", "parameters['regionguid']": item['province_code']},
                                         callback=self.parse_pages)

        # Work out how many pages there are
        def parse_pages(self, response):
            meta_1 = response.meta['meta_1']
            province = meta_1['province']
            notice_type_title = meta_1['notice_type_title']
            item = {}
            item['province'] = province
            item['notice_type_title'] = notice_type_title
            # Total page count, e.g. javascript:toPage('',1525);
            re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
            total_pages = re.findall(r"\d+", re_total_pages)
            if not total_pages:
                total_pages = "1"
            else:
                total_pages = total_pages[0]
            # For testing: cap the total page count at 3
            if total_pages != '1':
                total_pages = '3'
            # print("[%s][%s] total pages: %s" % (province, notice_type_title, total_pages))
            for page in range(1, int(total_pages) + 1):
                yield scrapy.FormRequest(url=meta_1['form_url'], meta={"meta_2": item},
                                         formdata={"page.pageNum": str(page), "parameters['regionguid']": meta_1['province_code']},
                                         callback=self.parse_detail, dont_filter=True)

        def parse_detail(self, response):
            meta_2 = response.meta['meta_2']
            # Areas, as a list
            areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
            # Notice titles
            notice_titles = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
            # Notice URLs
            notice_urls = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
            length = len(notice_titles)
            province = meta_2['province']
            notice_type_title = meta_2['notice_type_title']
            if length > 0:
                for i in range(length):
                    item = {}
                    item['notice_id'] = notice_urls[i].split("=")[-1]
                    # Region
                    item['province'] = province
                    # Notice type
                    item['notice_type_title'] = notice_type_title
                    # Notice title
                    item['notice_title'] = notice_titles[i]
                    # Notice URL
                    item['notice_url'] = notice_urls[i]
                    # Area
                    item['area'] = areas[i]
                    yield scrapy.Request(url=item['notice_url'], meta={"meta_3": item}, callback=self.parse_info)
            else:
                item = CaigouItem()
                no_data = "无数据"  # "no data" placeholder stored as-is
                item['notice_id'] = province + notice_type_title + no_data
                # Region
                item['province'] = province
                # Notice type
                item['notice_type_title'] = notice_type_title
                # Notice title
                item['notice_title'] = no_data
                # Notice URL
                item['notice_url'] = no_data
                # Area
                item['area'] = no_data
                # Publishing organization
                item['pub_company'] = no_data
                # Publication date
                item['notice_date'] = no_data
                yield item

        def parse_info(self, response):
            meta_3 = response.meta['meta_3']
            item = CaigouItem()
            # id
            item['notice_id'] = meta_3['notice_id']
            # Region
            item['province'] = meta_3['province']
            # Area
            item['area'] = meta_3['area']
            # Notice type
            item['notice_type_title'] = meta_3['notice_type_title']
            # Publishing organization, e.g. 发布单位:陕西炬荣招标代理有限公司
            pub_company = response.xpath('//div[@class="content_about"]/span[1]/text()').get()
            item['pub_company'] = pub_company.strip().split(":")[-1]
            # Notice title
            item['notice_title'] = meta_3['notice_title']
            # Publication date
            item['notice_date'] = response.xpath('//div[@class="content_about"]/span[2]/em/text()').get()
            # Notice URL
            item['notice_url'] = meta_3['notice_url']
            yield item
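
The whole spider hinges on the form parameters noted in the comments above: page.pageNum and parameters['regionguid'] POSTed to noticeaframe.do. If you want to confirm them outside Scrapy first, a one-off POST is enough. This is a minimal sketch, assuming the endpoint is still reachable and still accepts these fields:

    # Quick check of the AJAX form endpoint outside Scrapy (sketch; field names taken from the spider above)
    import requests

    form_url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"
    resp = requests.post(form_url, data={
        "page.pageNum": "1",
        "parameters['regionguid']": "610001",  # 陕西省本级, taken from the regionCity(...) onclick values
    })
    print(resp.status_code)
    print(resp.text[:500])  # should contain the rows of the list-box table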

5. Optional: write a Spider that extracts the item data (in the spiders folder: ccgp2.py), adding the SeleniumMiddleWare middleware and using Selenium to simulate clicking through the pages for the first region and the first notice type

    # -*- coding: utf-8 -*-
    # Built on top of ccgp.py: adds the SeleniumMiddleWare middleware and uses Selenium
    # to simulate clicking "next page" for the first region and the first notice type.
    import re
    import time

    import scrapy
    from scrapy.utils.project import get_project_settings


    class CcgpSpider(scrapy.Spider):
        name = 'ccgp2'
        allowed_domains = ['ccgp-shaanxi.gov.cn']
        start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']
        next_first_datas = []
        now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Empty the log file first; rewriting it has the same effect as deleting it
        with open(get_project_settings().get('LOG_FILE'), 'w+', encoding='utf-8') as f:
            f.write('Log file cleared, current time: %s\n' % now_time)

        def parse(self, response):
            # Total page count, e.g. javascript:toPage('',1525);
            re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
            total_pages = re.findall(r"\d+", re_total_pages)
            if not total_pages:
                total_pages = "无数据"
            else:
                total_pages = total_pages[0]
            if total_pages != "无数据":
                # For testing: only take the last 3 pages; remove if not wanted
                page = int(total_pages) - 2
                notice_nums = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[1]/text()').extract()
                notice_type_title_num = 1
                regionCity_num = 1
                # If the first entry of this page equals the first entry of the previous page, stop.
                # To keep the list from growing, clear it whenever the new entry is not already in it.
                # The first notice URL id on a page can repeat, so the row number is used instead.
                notice_num_first = notice_nums[0]
                if notice_num_first not in self.next_first_datas:
                    self.next_first_datas.clear()
                    self.next_first_datas.append(notice_num_first)
                    yield scrapy.Request(url=response.request.url, meta={"next_page": True,
                                         "notice_type_title_num": notice_type_title_num,
                                         "regionCity_num": regionCity_num, "page": page}, dont_filter=True)
                    # Areas, as a list
                    areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
                    # Notice titles
                    notice_titles = response.xpath(
                        '//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
                    # Notice URLs
                    notice_urls = response.xpath(
                        '//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
                    length = len(notice_titles)
                    for i in range(length):
                        item = {}
                        item['notice_id'] = notice_urls[i].split("=")[-1]
                        item['area'] = areas[i].strip()
                        item['notice_title'] = notice_titles[i].strip()
                        item['notice_url'] = notice_urls[i]
                        yield item
                else:
                    # Current page number
                    current_page = response.xpath('//ul[@class="pagination"]/li[@class="active"]/a/text()').get()
                    print("Reached the last page: %s" % current_page)
            else:
                no_data = "无数据"  # "no data" placeholder stored as-is
                item = {}
                item['area'] = no_data
                item['notice_title'] = no_data
                item['notice_url'] = no_data
                yield item
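
Since ccgp needs SeleniumMiddleWare disabled while ccgp2 needs it enabled, toggling settings.py by hand on every switch is easy to forget. One option, shown here only as a sketch and not part of the original project, is a per-spider custom_settings override inside ccgp2.py:

    # Sketch: per-spider middleware override, so settings.py can stay unchanged
    class CcgpSpider(scrapy.Spider):
        name = 'ccgp2'
        custom_settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'caigou.middlewares.RandomUserAgentMiddleWare': 543,
                'caigou.middlewares.SeleniumMiddleWare': 544,
            }
        }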

6. Handle data persistence in the pipelines file; results can also be saved to a local file (pipelines.py)

    # -*- coding: utf-8 -*-
    import time

    from openpyxl import Workbook
    import pymysql
    import pymongo
    import redis
    from scrapy.exceptions import DropItem
    from scrapy.utils.project import get_project_settings
    from scrapy import Item


    class XlsxPipeline(object):
        def __init__(self):
            self.wb = Workbook()
            self.ws = self.wb.active
            self.ws.title = "采购网数据表"
            # Header row (column labels kept in Chinese, matching the stored data)
            self.ws.append(['notice_id', '区域', '地区', '公告类型',
                            '发布单位', '公告标题',
                            '公告发布时间', '公告url'])

        def process_item(self, item, spider):
            text = [item['notice_id'], item['province'], item['area'], item['notice_type_title'],
                    item['pub_company'], item['notice_title'],
                    item['notice_date'], item['notice_url']]
            self.ws.append(text)
            return item

        def close_spider(self, spider):
            file_end_name = time.strftime("%Y-%m-%d", time.localtime())
            self.wb.save(spider.name + file_end_name + ".xlsx")
            print("Spreadsheet finished!")


    class MysqlPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
            cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
            cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
            cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
            cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
            cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
            return cls()

        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                      passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
            self.cursor = self.db.cursor()
            # Count how many rows get inserted
            self.num = 0

        def process_item(self, item, spider):
            try:
                sql = """CREATE TABLE IF NOT EXISTS caigou(notice_id VARCHAR(40) PRIMARY KEY NOT NULL,
                      province VARCHAR(10), area VARCHAR(17), notice_type_title VARCHAR(8), pub_company VARCHAR(60),
                      notice_title VARCHAR(150), notice_date VARCHAR(20), notice_url VARCHAR(150)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;"""
                self.cursor.execute(sql)
            except:
                pass
            try:
                self.cursor.execute("SELECT notice_id FROM caigou WHERE notice_id=%s", (item['notice_id'],))
                switch = self.cursor.fetchone()
                keys, values = zip(*item.items())
                if switch:
                    # Row already exists: upsert it
                    sql = "INSERT INTO caigou({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values)),
                        ','.join(["{}=%s".format(k) for k in keys])
                    )
                    self.cursor.execute(sql, values * 2)
                else:
                    self.cursor.execute("INSERT INTO caigou({}) VALUES ({});".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values))
                    ), values)
                    self.num += 1
                self.db.commit()
            except Exception as e:
                print("MySQL error:", e)
                self.db.rollback()
            return item

        def close_spider(self, spider):
            print("MySQL finished, %s new rows inserted this run!" % self.num)
            self.cursor.close()
            self.db.close()


    class MongoPipeline():
        MONGO_HOST = get_project_settings().get('MONGO_HOST')
        MONGO_PORT = get_project_settings().get('MONGO_PORT')
        MONGO_DB = get_project_settings().get('MONGO_DB')

        def open_spider(self, spider):
            self.cli = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
            # Count how many documents get inserted
            self.num = 0

        def process_item(self, item, spider):
            try:
                self.db = self.cli[self.MONGO_DB]
                ccgp = self.db[spider.name]
                data = dict(item) if isinstance(item, Item) else item
                notice_title = item['notice_title']
                notice_id = item['notice_id']
                count = ccgp.count_documents({'notice_id': notice_id})
                if count == 0:
                    print("Adding %s to MongoDB..." % notice_title)
                    ccgp.insert_one(data)
                    self.num += 1
                else:
                    print("%s: already stored, skipping!" % notice_title)
            except Exception as e:
                print("MongoDB error:", e)
            return item

        def close_spider(self, spider):
            print("MongoDB finished, %s new documents added!" % self.num)
            self.cli.close()


    class RedisPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
            cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
            cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
            cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
            return cls()

        def open_spider(self, spider):
            try:
                # Keyword arguments, so decode_responses is not mistaken for the password
                self.redis_client = redis.StrictRedis(host=self.REDIS_HOST, port=self.REDIS_PORT,
                                                      db=self.REDIS_DBNAME,
                                                      decode_responses=self.REDIS_decode_responses)
            except Exception as e:
                print("Redis error:", e)

        def process_item(self, item, spider):
            # sadd returns 1 only for ids not seen before; duplicates are dropped
            if self.redis_client.sadd("ccgp:items", item['notice_id']):
                return item
            raise DropItem

        def close_spider(self, spider):
            print("Redis finished!")

7. Configure the settings file (settings.py)

    # Optionally write the log to a local file
    LOG_FILE = "ccgp.log"
    LOG_LEVEL = "DEBUG"
    # Also redirect print output into the log
    LOG_STDOUT = True

    MYSQL_HOST = "localhost"
    MYSQL_PORT = 3306
    MYSQL_USER = "root"
    MYSQL_PASSWD = "123456"
    MYSQL_DBNAME = "python5"
    MYSQL_CHARSET = "utf8mb4"

    MONGO_HOST = "localhost"
    MONGO_PORT = 27017
    MONGO_DB = "py4"

    REDIS_HOST = "localhost"
    REDIS_PORT = 6379
    REDIS_DBNAME = 4
    REDIS_decode_responses = True  # store/read values as strings instead of bytes

    # Random request headers
    USER_AGENT_LIST = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"
    ]

    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3

    # Downloader middlewares; note that SeleniumMiddleWare should only be enabled for ccgp2
    DOWNLOADER_MIDDLEWARES = {
        'caigou.middlewares.RandomUserAgentMiddleWare': 543,
        # 'caigou.middlewares.SeleniumMiddleWare': 544,
        # 'caigou.middlewares.CaigouDownloaderMiddleware': 543,
    }

    # Enable pipelines selectively
    ITEM_PIPELINES = {
        'caigou.pipelines.XlsxPipeline': 300,
        # 'caigou.pipelines.MysqlPipeline': 301,
        # 'caigou.pipelines.MongoPipeline': 302,
        # 'caigou.pipelines.RedisPipeline': 303,
    }

8. Optional: add the random request-header middleware and the Selenium middleware (middlewares.py)

    import time
    import random

    from scrapy import signals
    from scrapy.http import HtmlResponse
    from selenium import webdriver


    class RandomUserAgentMiddleWare():
        @classmethod
        def from_crawler(cls, crawler):
            cls.USER_AGENT_LIST = crawler.settings.get('USER_AGENT_LIST')
            return cls()

        def process_request(self, request, spider):
            user_agent = random.choice(self.USER_AGENT_LIST)
            print("User-Agent chosen for this request: %s" % user_agent)
            request.headers['User-Agent'] = user_agent


    class SeleniumMiddleWare():
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            # crawler.signals.connect(s.engine_started, signal=signals.engine_started)
            # cls.LOG_FILE = crawler.settings.get('LOG_FILE')
            return s

        # Clearing the log file here works, but it also wipes the startup log lines,
        # so it is done in the spider file instead.
        # def engine_started(self):
        #     with open(self.LOG_FILE, 'w+') as f:
        #         f.write('')
        #     print("Log file cleared!")

        def spider_opened(self, spider):
            self.chrome = webdriver.Chrome()
            self.chrome.maximize_window()

        def process_request(self, request, spider):
            # Is this the first request? The first one carries no "next_page" meta.
            # request.meta on the first request: {'download_timeout': 180.0}
            if not request.meta.get("next_page", False):
                self.chrome.get(request.url)
            # request.meta on the second request: {'next_page': True, 'depth': 1, 'download_timeout': 180.0}
            else:
                if request.meta['depth'] == 1 or request.meta.get("current_page", False):
                    notice_type_title_num = request.meta['notice_type_title_num']
                    regionCity_num = request.meta['regionCity_num']
                    time.sleep(2)
                    print("Clicking region number %s" % notice_type_title_num)
                    self.chrome.find_element_by_xpath(
                        '//ul[@id="toRegion"]/li[' + str(notice_type_title_num) + ']').click()
                    time.sleep(1)
                    print("Clicking notice type number %s" % regionCity_num)
                    self.chrome.find_element_by_xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li[' + str(
                        regionCity_num) + ']').click()
                    # For testing: there are many pages, so jump straight to the requested
                    # page to see how the last pages are handled
                    time.sleep(2)
                    self.chrome.find_element_by_id('infoNoticeInputPage').clear()
                    time.sleep(1)
                    self.chrome.find_element_by_id('infoNoticeInputPage').send_keys(request.meta['page'])
                    time.sleep(1)
                    self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()]/button').click()
                    time.sleep(1)
                else:
                    # Click "next page"
                    self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()-3]/a').click()
                    time.sleep(2)
            html = self.chrome.page_source
            return HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8')

        def spider_closed(self, spider):
            time.sleep(3)
            print("Closing the browser...")
            self.chrome.quit()
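
The find_element_by_xpath / find_element_by_id helpers used above exist in Selenium 3 but were removed in Selenium 4. If you run a current Selenium, the equivalent lookups use the By locator instead; a small standalone sketch with the same XPaths and element id:

    # Selenium 4 replacements for the find_element_by_* calls used in the middleware
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    chrome = webdriver.Chrome()
    chrome.get('http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province')
    chrome.find_element(By.XPATH, '//ul[@id="toRegion"]/li[1]').click()
    chrome.find_element(By.ID, 'infoNoticeInputPage').clear()
    chrome.quit()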

9. Remember to start the MySQL/Redis/MongoDB servers in advance and create the corresponding table

    CREATE TABLE IF NOT EXISTS caigou(
        notice_id VARCHAR(40) PRIMARY KEY NOT NULL,
        province VARCHAR(10), area VARCHAR(17), notice_type_title VARCHAR(8),
        pub_company VARCHAR(60), notice_title VARCHAR(150),
        notice_date VARCHAR(20), notice_url VARCHAR(150)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
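
The CREATE TABLE above covers MySQL. MongoDB and Redis need no schema, but it is worth checking connectivity before the first run, and optionally giving the MongoDB collection a unique index on notice_id to mirror the MySQL primary key. A sketch using the hosts, ports, and database names from settings.py:

    # Connectivity check for MongoDB and Redis (values taken from settings.py)
    import pymongo
    import redis

    cli = pymongo.MongoClient("localhost", 27017)
    # Optional: unique index so duplicate notices cannot be inserted twice
    cli["py4"]["ccgp"].create_index("notice_id", unique=True)

    r = redis.StrictRedis(host="localhost", port=6379, db=4, decode_responses=True)
    print(r.ping())  # True if the Redis server is up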

10. With everything configured, start crawling: run the project command crawl to launch the Spider:

    scrapy crawl ccgp
    scrapy crawl ccgp2
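
Because LOG_FILE is set in settings.py, the run output goes to ccgp.log rather than the console. If you also want a quick file dump of the items alongside the database pipelines, Scrapy's feed export can be added on the command line, for example:

    scrapy crawl ccgp -o ccgp.json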

 
