1. Create the Scrapy project
- scrapy startproject Tencent2Spider
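This generates the standard Scrapy project skeleton, roughly the following (middlewares.py appears in newer Scrapy versions):
- Tencent2Spider/
-     scrapy.cfg            # deployment configuration
-     Tencent2Spider/       # the project's Python module
-         __init__.py
-         items.py          # item definitions (step 3)
-         middlewares.py
-         pipelines.py      # item pipelines (step 5)
-         settings.py       # project settings (step 6)
-         spiders/          # spiders go here (step 4)
-             __init__.py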
2. Enter the project directory and create a Spider with the genspider command
- scrapy genspider -t crawl tencent "tencent.com"
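The -t crawl flag selects the CrawlSpider template, so spiders/tencent.py starts out as a skeleton roughly like the one below (exact contents vary by Scrapy version); step 4 fills it in:
- import scrapy
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
-
- class TencentSpider(CrawlSpider):
-     name = 'tencent'
-     allowed_domains = ['tencent.com']
-     start_urls = ['http://tencent.com/']
-     rules = (
-         Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
-     )
-     def parse_item(self, response):
-         pass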
3. Define the data to scrape (edit items.py)
- # -*- coding: utf-8 -*-
- import scrapy
-
- class Tencent2SpiderItem(scrapy.Item):
-     # Job title
-     positionName = scrapy.Field()
-     # Detail-page link
-     positionLink = scrapy.Field()
-     # Job category
-     positionType = scrapy.Field()
-     # Number of openings
-     peopleNumber = scrapy.Field()
-     # Work location
-     workLocation = scrapy.Field()
-     # Publish date
-     publishTime = scrapy.Field()
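A scrapy.Item behaves much like a dict, except that only declared fields can be assigned; a quick sketch of how the class above is used:
- item = Tencent2SpiderItem()
- item['positionName'] = '测试工程师'
- print(item['positionName'])
- # Assigning to an undeclared field raises KeyError:
- # item['salary'] = '20k'  # KeyError: Tencent2SpiderItem does not support field: salary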
4. Write the Spider that extracts the item data (spiders/tencent.py)
- # -*- coding: utf-8 -*-
- import scrapy
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
- # If PyCharm marks the import below with a red squiggle, see: https://www.cdsy.xyz/computer/soft/develop/241211/cd65012.html
- from Tencent2Spider.items import Tencent2SpiderItem
-
- class TencentSpider(CrawlSpider):
-     name = 'tencent'
-     allowed_domains = ['hr.tencent.com']
-     start_urls = ['https://hr.tencent.com/position.php?&start=0#a']
-     # LinkExtractor() defines which links to follow; `allow` takes a regular expression
-     rules = (
-         Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item', follow=True),
-     )
-
-     def parse_item(self, response):
-         for each in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
-             # Instantiate the item model
-             item = Tencent2SpiderItem()
-             # Query relative to the current node; extract() returns a list of unicode strings
-             positionName = each.xpath("./td[1]/a/text()").extract()[0]
-             positionLink = each.xpath("./td[1]/a/@href").extract()[0]
-             # The job category can be empty; if it is, fall back to "未知" (unknown)
-             if each.xpath("./td[2]/text()[. != '']"):
-                 positionType = each.xpath("./td[2]/text()").extract()[0]
-             else:
-                 positionType = "未知"
-             peopleNumber = each.xpath("./td[3]/text()").extract()[0]
-             workLocation = each.xpath("./td[4]/text()").extract()[0]
-             publishTime = each.xpath("./td[5]/text()").extract()[0]
-
-             # Store the values as utf-8 bytes; the pipeline's MyEncoder (step 5)
-             # decodes them back to str when serializing to JSON.
-             # Job title
-             item['positionName'] = positionName.encode('utf-8')
-             # Detail-page link
-             item['positionLink'] = positionLink.encode('utf-8')
-             # Job category
-             item['positionType'] = positionType.encode('utf-8')
-             # Number of openings
-             item['peopleNumber'] = peopleNumber.encode('utf-8')
-             # Work location
-             item['workLocation'] = workLocation.encode('utf-8')
-             # Publish date
-             item['publishTime'] = publishTime.encode('utf-8')
-
-             yield item
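Before running the full crawl, the XPath expressions above can be tested interactively with scrapy shell (a sketch; the site's markup may have changed since this was written):
- scrapy shell "https://hr.tencent.com/position.php?&start=0#a"
- >>> response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
- >>> response.xpath('//tr[@class="even"]/td[1]/a/text()').extract_first()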
5. Write the item pipeline that saves the results to a file (pipelines.py)
- # -*- coding: utf-8 -*-
- import json
-
- # A json.JSONEncoder subclass that converts bytes values back to str
- class MyEncoder(json.JSONEncoder):
-     def default(self, o):
-         if isinstance(o, bytes):
-             return str(o, encoding='utf-8')
-         return json.JSONEncoder.default(self, o)
-
- class Tencent2SpiderPipeline(object):
-     def __init__(self):
-         self.file = open("tencent.json", "w", encoding="utf-8")
-
-     def process_item(self, item, spider):
-         text = json.dumps(dict(item), ensure_ascii=False, cls=MyEncoder) + '\n'
-         self.file.write(text)
-         return item
-
-     def close_spider(self, spider):
-         print("Processing finished, thank you!")
-         self.file.close()
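Each item ends up as one JSON object per line in tencent.json, along these lines (illustrative values only):
- {"positionName": "后台开发工程师", "positionLink": "position_detail.php?id=...", "positionType": "技术类", "peopleNumber": "1", "workLocation": "深圳", "publishTime": "2018-05-01"}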
6. Configure the settings file (settings.py)
- # Obey robots.txt rules; for what this means, see: https://www.cdsy.xyz/computer/programme/Python/241210/cd64912.html
-
- ROBOTSTXT_OBEY = False
-
- # Override the default request headers: add a User-Agent
- DEFAULT_REQUEST_HEADERS = {
-     'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
-     # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-     # 'Accept-Language': 'en',
- }
-
- # Configure item pipelines: uncomment this block to enable the pipeline
- ITEM_PIPELINES = {
-     'Tencent2Spider.pipelines.Tencent2SpiderPipeline': 300,
- }
-
- # Optionally, write the log to a local file
- LOG_FILE = "tencentlog.log"
- LOG_LEVEL = "DEBUG"
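Optionally, a download delay keeps the crawl polite (DOWNLOAD_DELAY is a standard Scrapy setting; the value here is just a suggestion):
- # Wait 1 second between requests to avoid hammering the server
- DOWNLOAD_DELAY = 1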
7. With everything above in place, start the crawl by running the crawl command:
- scrapy crawl tencent
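As an alternative to the custom pipeline, Scrapy's built-in feed export can write items straight to a file (the name tencent2.json is arbitrary, chosen so it does not clobber the pipeline's tencent.json):
- scrapy crawl tencent -o tencent2.json
With non-ASCII data, also set FEED_EXPORT_ENCODING = 'utf-8' in settings.py (supported since Scrapy 1.2); otherwise the feed export escapes Chinese characters.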