Python Crawler Selections, Part 07: Autohome (che168) Incremental Crawler Case Study
1. Approach

1) Target URL
   Autohome (che168) - used cars - sorted by price, low to high
   https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp1exx0/

2) Data to extract
   For every car: model name, mileage, registration date, gearbox,
   displacement, city, and price

3) Crawl analysis
   ********* first-level (listing) page *********
   1. the link to each car's detail page
   ********* second-level (detail) page *********
   1. name
   2. mileage
   3. registration date
   4. gearbox
   5. displacement
   6. city
   7. price
2. Implementation Steps

[1] Confirm that the data to be scraped is present in the raw response - it is.

[2] Find the URL pattern
    page 1: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp1exx0/
    page 2: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp2exx0/
    page n: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/

    XPath references from the page analysis (match counts in brackets):
    all cities [16]:              //div[@class="list-line"]
    listing-page car names [75]:  //div[@class="card-right"]/h3
    listing-page year [75]:       //div[@class="car-info"]/span[1]
    listing-page mileage [75]:    //div[@class="car-info"]/span[2]
    listing-page price [75]:      //div[@class="bt-box"]/strong

[3] Write the regular expressions
    first-level page:
    <li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>
    second-level page:
    <div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<ul class="brand-unit-item fn-clear">.*?<li>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>

[4] Implement the code (section 3 below); a quick sanity check of the first-level regex follows.
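Before wiring the first-level regex into a spider, it is worth checking it against a small fragment. A minimal sketch - the HTML below is hand-written to imitate the listing markup, not copied from che168.com:

import re

# Hypothetical fragment: only the class name in the <li> tag comes
# from the regex above; hrefs and attributes are invented.
sample = (
    '<li class="cards-li list-photo-li" data-id="1">'
    '<a href="/dealer/1001/a.html">car A</a></li>'
    '<li class="cards-li list-photo-li" data-id="2">'
    '<a href="/dealer/1002/b.html">car B</a></li>'
)

one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
print(re.findall(one_regex, sample, re.S))
# -> ['/dealer/1001/a.html', '/dealer/1002/b.html']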
3. Code Implementation

"""
Autohome (che168) used-car scraper.
Approach:
    1. first-level page: the link to each car
    2. second-level page: the details of that car

Build a User-Agent pool to make bot detection less likely,
using the fake_useragent module:
    install: sudo pip3 install fake_useragent
    usage:
        from fake_useragent import UserAgent
        UserAgent().random
"""
import requests
import re
import time
import random
from fake_useragent import UserAgent


class CarSpider:
    def __init__(self):
        self.url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'

    def get_html(self, url):
        """Helper 1 - fetch a page and return its HTML."""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).text
        return html

    def re_func(self, regex, html):
        """Helper 2 - run a regex over the HTML, return all matches."""
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        return r_list

    def parse_html(self, one_url):
        """Main crawl logic for one listing page."""
        one_html = self.get_html(url=one_url)
        one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
        href_list = self.re_func(regex=one_regex, html=one_html)
        for href in href_list:
            two_url = 'https://www.che168.com' + href
            self.get_car_info(two_url)
            # Pause between detail pages to stay polite.
            time.sleep(random.randint(1, 2))

    def get_car_info(self, two_url):
        """Extract the details of one car from its detail page."""
        two_html = self.get_html(url=two_url)
        two_regex = '<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<ul class="brand-unit-item fn-clear">.*?<li>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>'
        car_list = self.re_func(regex=two_regex, html=two_html)
        item = {}
        item['name'] = car_list[0][0].strip()
        item['km'] = car_list[0][1].strip()
        item['time'] = car_list[0][2].strip()
        # The third <h4> holds "gearbox / displacement".
        item['type'] = car_list[0][3].split('/')[0].strip()
        item['displace'] = car_list[0][3].split('/')[1].strip()
        item['address'] = car_list[0][4].strip()
        item['price'] = car_list[0][5].strip()
        print(item)

    def run(self):
        # Crawl listing pages 1-4.
        for i in range(1, 5):
            url = self.url.format(i)
            self.parse_html(url)


if __name__ == '__main__':
    spider = CarSpider()
    spider.run()

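One weakness of get_html() above: requests.get() is called without a timeout, so a single stalled connection can hang the whole run. A hedged variant - the 10-second timeout is an assumption, not part of the original:

import requests
from fake_useragent import UserAgent

def get_html(url, timeout=10):
    """Same role as CarSpider.get_html, plus two safeguards: a
    timeout so a dead connection cannot block forever, and
    raise_for_status() so 4xx/5xx responses fail loudly."""
    headers = {'User-Agent': UserAgent().random}
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp.text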
4. Data Persistence (Saving to MySQL)

create database cardb charset utf8;
use cardb;
create table cartab(
    name varchar(100),
    km varchar(50),
    years varchar(50),
    type varchar(50),
    displacement varchar(50),
    city varchar(50),
    price varchar(50)
)charset=utf8;
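With the table in place, a row is written with a parameterized INSERT. A minimal sketch - the connection parameters and the sample row are assumptions, not values from the original:

import pymysql

# Placeholder credentials for a local MySQL instance.
db = pymysql.connect(host='localhost', user='root',
                     password='123456', database='cardb',
                     charset='utf8')
cursor = db.cursor()

# One hypothetical row, in cartab column order:
# name, km, years, type, displacement, city, price
row = ('Audi A3 2020', '12,000 km', '2020-06',
       'automatic', '1.4T', 'Beijing', '158,000')
cursor.execute('insert into cartab values(%s,%s,%s,%s,%s,%s,%s)', row)
db.commit()

cursor.close()
db.close()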
5. Incremental Crawling (Redis Implementation)

"""
Hint: use a Redis set and its sadd() method - adding a new member
returns 1, adding an already-present member returns 0.
"""

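That return value is what makes the set usable as a "have I seen this URL?" filter; a quick check, assuming a local Redis server on the default port:

import redis

r = redis.Redis(host='localhost', port=6379, db=0)
print(r.sadd('car:urls', 'demo-fingerprint'))  # 1 -> new member, crawl it
print(r.sadd('car:urls', 'demo-fingerprint'))  # 0 -> already seen, skip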
import requests
import re
import time
import random
import pymysql
from hashlib import md5
import sys
import redis


class CarSpider(object):
    def __init__(self):
        self.url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
        # MySQL connection for the scraped rows; pymysql >= 1.0 only
        # accepts keyword arguments here.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='cardb',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Redis connection holding the set of URL fingerprints.
        self.r = redis.Redis(host='localhost', port=6379, db=0)

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).text
        return html

    def re_func(self, regex, html):
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        return r_list
    def parse_html(self, one_url):
        one_html = self.get_html(one_url)
        one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
        href_list = self.re_func(one_regex, one_html)
        for href in href_list:
            # Fingerprint the href so the Redis set stores a
            # fixed-size key instead of the raw URL.
            s = md5()
            s.update(href.encode())
            finger = s.hexdigest()

            # sadd() returns 1 only for fingerprints not yet in the set.
            if self.r.sadd('car:urls', finger):
                url = 'https://www.che168.com' + href
                self.get_data(url)
                time.sleep(random.randint(1, 2))
            else:
                # First already-seen URL: everything after it is assumed
                # to have been scraped on a previous run, so stop here.
                sys.exit('Crawl finished - no new listings')

    def go_spider(self, finger):
        """Alternative MySQL-based dedup check against a
        request_finger table (not called in this Redis version);
        returns True when the fingerprint is new."""
        sel = 'select * from request_finger where finger=%s'
        result = self.cursor.execute(sel, [finger])
        if result:
            return False
        return True
    def get_data(self, url):
        two_html = self.get_html(url)
        two_regex = '<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<ul class="brand-unit-item fn-clear">.*?<li>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>'
        item = {}
        car_info_list = self.re_func(two_regex, two_html)
        item['name'] = car_info_list[0][0]
        item['km'] = car_info_list[0][1]
        item['year'] = car_info_list[0][2]
        # The third <h4> holds "gearbox / displacement".
        item['type'] = car_info_list[0][3].split('/')[0]
        item['displacement'] = car_info_list[0][3].split('/')[1]
        item['city'] = car_info_list[0][4]
        item['price'] = car_info_list[0][5]
        print(item)

        one_car_list = [
            item['name'],
            item['km'],
            item['year'],
            item['type'],
            item['displacement'],
            item['city'],
            item['price']
        ]
        ins = 'insert into cartab values(%s,%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(ins, one_car_list)
        self.db.commit()
    def run(self):
        # Only page 1 here; widen the range to crawl more pages.
        for p in range(1, 2):
            url = self.url.format(p)
            self.parse_html(url)

        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = CarSpider()
    spider.run()
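The fingerprint stored in Redis is just the md5 hex digest of the listing href, so the same URL always maps to the same set member across runs:

from hashlib import md5

def finger_of(href):
    # Same fingerprinting as in parse_html(): md5 of the href bytes.
    s = md5()
    s.update(href.encode())
    return s.hexdigest()

# Hypothetical listing path; the digest is deterministic, which is
# what lets sadd() recognise a re-crawled URL.
print(finger_of('/dealer/1001/a.html'))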