1. Create the Scrapy project
- scrapy startproject PosClient
2. Enter the project directory and create a Spider with the genspider command (a typical resulting layout is sketched below)
- scrapy genspider posclient XXXX.com
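A rough sketch of the project layout these two commands produce (file names may vary slightly across Scrapy versions):

```
PosClient/
├── scrapy.cfg            # deploy configuration
└── PosClient/            # the project's Python package
    ├── __init__.py
    ├── items.py          # item definitions (step 3)
    ├── middlewares.py
    ├── pipelines.py      # item pipelines (step 5)
    ├── settings.py       # project settings (step 6)
    └── spiders/
        ├── __init__.py
        └── posclient.py  # the spider created by genspider (step 4)
```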
3. Define the data to scrape (items.py)
```python
# -*- coding: utf-8 -*-
import scrapy


class PosclientItem(scrapy.Item):
    # Row number
    number_list = scrapy.Field()
    # Customer phone number
    client_phone = scrapy.Field()
    # Customer name
    client_name = scrapy.Field()
    # Customer address
    client_add = scrapy.Field()
    # Registration date
    client_date = scrapy.Field()
    # Purchase amount
    client_sale = scrapy.Field()
    # Number of purchases
    client_sale_num = scrapy.Field()
    # Planted area
    client_area = scrapy.Field()
```
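As a quick sanity check (for example in a scrapy shell session inside the project; the values below are made up), an Item instance behaves like a dict that only accepts the declared fields:

```python
from PosClient.items import PosclientItem

item = PosclientItem()
item['client_name'] = 'Zhang San'      # hypothetical value
item['client_phone'] = '13800000000'   # hypothetical value
print(dict(item))
# item['unknown'] = 1 would raise KeyError, because 'unknown' is not a declared Field
```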
4. Write the Spider that extracts the item data (spiders/posclient.py)
```python
# -*- coding: utf-8 -*-
import scrapy
from PosClient.items import PosclientItem


class PosclientSpider(scrapy.Spider):
    name = 'posclient'
    allowed_domains = ['XXXX.com']
    # Login page URL
    login_page = 'https://pos.XXXX.com/login.html'
    offset = 1
    # First half of the URL of the pages that require a login
    url = 'https://pos.XXXX.com/client/p='
    # Build the URL of the first page to crawl
    start_urls = [url + str(offset)]
    username = input("Enter account: ")
    password = input("Enter password: ")

    # The method name must not be changed
    def start_requests(self):
        yield scrapy.Request(url=self.login_page, callback=self.login)

    # Log in by submitting the login form
    def login(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page
        )

    # Check whether the login succeeded, then request pages that require a login
    def parse_page(self, response):
        if "loginerror" in response.body.decode('utf-8'):
            print("Login failed: wrong phone number or password!")
        if "</span>首页" in response.body.decode('utf-8'):  # "首页" (Home) appears after a successful login
            print("Welcome '%s', you are logged in to the POS management system!" % self.username)
            # After logging in, fetch the client list page and let parse() handle the data
            full_url = self.url + str(self.offset)
            yield scrapy.Request(full_url, callback=self.parse)

    def parse(self, response):
        # Collect the links to the next pages
        next_url_list = response.xpath('//div[@class="dataTables_paginate paging_full_numbers"]/span/span/a/@href').extract()
        for each in response.xpath('//div[@class="dataTables_wrapper"]'):
            # Row number (has leading/trailing whitespace)
            number_list = each.xpath('.//td[1]/text()').extract()
            # Customer phone number
            client_phone = each.xpath('.//td[2]/a[1]/text()').extract()
            # Customer name
            client_name = each.xpath('.//td[2]/a[2]/text()').extract()
            # Customer address
            client_add = each.xpath('.//td[3]/a/text()').extract()
            # Registration date
            client_date = each.xpath('.//tbody//td[4]/a/text()').extract()
            # Purchase amount (has leading/trailing whitespace)
            client_sale = each.xpath('.//tbody//td[5]/a/text()').extract()
            # Number of purchases
            client_sale_num = each.xpath('.//tbody//td[6]/a/text()').extract()
            # Planted area (has leading/trailing whitespace)
            client_area = each.xpath('.//tbody//td[7]/text()').extract()
            for i in range(len(client_phone)):
                item = PosclientItem()
                item['number_list'] = number_list[i].strip()
                item['client_phone'] = client_phone[i].strip()
                item['client_name'] = client_name[i].strip()
                item['client_add'] = client_add[i].strip()
                # The date ends with ".0" (e.g. 2017-11-10 11:04:40.0), so drop the last two characters
                item['client_date'] = client_date[i].strip()[:-2]
                item['client_sale'] = client_sale[i].strip()
                item['client_sale_num'] = client_sale_num[i].strip()
                item['client_area'] = client_area[i].strip()
                yield item
        # Follow the next pages
        for url in next_url_list:
            full_url = 'https://pos.XXXX.com/client.html' + str(url)
            yield scrapy.Request(url=full_url, callback=self.parse)
```
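To make the cleanup in parse() concrete, this is what the strip()/slice operations do to the raw cell text (the values here are made-up examples in the format described by the comments above):

```python
raw_date = '  2017-11-10 11:04:40.0  '   # hypothetical raw cell value
print(raw_date.strip()[:-2])             # -> '2017-11-10 11:04:40' (trailing '.0' removed)

raw_area = '  12.5  '                    # hypothetical raw cell value
print(raw_area.strip())                  # -> '12.5'
```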
5. Save the data in the item pipeline (pipelines.py); here the results are written to an Excel file
```python
# -*- coding: utf-8 -*-
import json
from openpyxl import Workbook


# Encoder subclassing json.JSONEncoder, used to convert bytes values when dumping JSON
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class PosclientPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Write the header row
        self.ws.append(['Row number', 'Customer phone', 'Customer name', 'Customer address',
                        'Registration date', 'Purchase amount', 'Number of purchases', 'Planted area'])

    def process_item(self, item, spider):
        text = [item['number_list'], item['client_phone'], item['client_name'], item['client_add'],
                item['client_date'], item['client_sale'], item['client_sale_num'], item['client_area']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        self.wb.save('pos_client.xlsx')
        print("Done processing the data, thanks for using!")
```
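Note that MyEncoder is defined above but not used by PosclientPipeline; it is only needed if you also want to dump the items as JSON. A minimal sketch of such an optional pipeline (the class name is made up; it would also need its own entry in ITEM_PIPELINES):

```python
class PosclientJsonPipeline(object):
    """Optional: write each item as one JSON line, using MyEncoder to handle bytes values."""
    def open_spider(self, spider):
        self.f = open('pos_client.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False, cls=MyEncoder) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
```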
6. Configure the settings file (settings.py)
```python
# Obey robots.txt rules; for details see: https://www.cdsy.xyz/computer/programme/Python/241210/cd64912.html
ROBOTSTXT_OBEY = False

# Download delay
DOWNLOAD_DELAY = 2

# Override the default request headers: add a User-Agent
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Configure item pipelines: uncomment this block to enable the pipeline
ITEM_PIPELINES = {
    'PosClient.pipelines.PosclientPipeline': 300,
}

# Optionally write the logs to a local file
LOG_FILE = "stats.log"
LOG_LEVEL = "DEBUG"
# Also redirect print output into the log
LOG_STDOUT = True
```
7. With everything configured, start crawling: run the crawl command to launch the Spider:
- scrapy crawl posclient
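Once the crawl finishes, the exported workbook can be spot-checked with openpyxl (a small sketch; the filename comes from the pipeline above):

```python
from openpyxl import load_workbook

wb = load_workbook('pos_client.xlsx')
ws = wb.active
print(ws.max_row - 1, "data rows exported")   # the first row is the header
for row in ws.iter_rows(min_row=1, max_row=3, values_only=True):
    print(row)
```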