爬取链接:https://wf.zu.anjuke.com/?from=navigation
首先打开浏览器,输入网址,利用 Network 面板进行抓包,找到 type 为 document 的响应源码文件
我们发现我们要爬取的数据在html源码中,那么我们就方便了,使用requests和lxml配合使用爬取租房信息,采用面向对象进行开发,方便代码修改,然后我们编写代码!!!
# -- coding: utf-8 --
# @Time : 2020/12/28 15:40
# @FileName: Anjuke.py
# @Software: PyCharm
import csv
import json

import requests
from lxml import etree
from pymongo import MongoClient
class Anjuke(object):
    """Scrape rental listings from Anjuke and store them in CSV and MongoDB.

    The crawl starts at ``self.url``, follows the "下一页" (next page) link
    until there is none, and for every listing also downloads its detail
    page. Rows are written through the module-level ``csv_writer`` and
    inserted into the ``Anjuke.House`` MongoDB collection.
    """

    # Endpoint that returns a JSON list of proxy servers under the "data" key.
    PROXY_API_URL = (
        'http://webapi.http.zhimacangku.com/getip?num=20&type=2&pro=&city=0&yys=0&port=1&pack=131877&ts'
        '=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
    )
    # Seconds to wait for any single HTTP request before giving up on it.
    REQUEST_TIMEOUT = 10
    # Placeholder stored when a field cannot be found on the page.
    MISSING = "空"
    # CSV column order; matches the header row written by the script entry.
    FIELDS = ("标题", "链接", "价格", "大小", "户型", "姓名", "小区", "地址",
              "整租", "方向", "要求", "描述", "图片", "日期")
    # Fields read from a listing's detail page, with their XPath expressions.
    DETAIL_XPATHS = (
        ("要求", '/html/body/div[3]/div[2]/div[1]/ul[1]/li[1]/span[2]/text()'),
        ("描述", '/html/body/div[3]/div[2]/div[1]/div[6]/b/text()'),
        ("图片", '//*[@id="room_pic_wrap"]/div/img/@data-src'),
        ("日期", '/html/body/div[3]/div[2]/div[1]/div[2]/div/b/text()'),
    )

    def __init__(self):
        """Set the start URL, request headers and the MongoDB collection."""
        self.url = 'https://wf.zu.anjuke.com/?from=navigation'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/87.0.4280.88 Safari/537.36 ',
            'referer': 'https://weifang.anjuke.com/'
        }
        # Proxy mapping used for page requests; None means a direct connection.
        self.proxies = None
        self.client = MongoClient("127.0.0.1", 27017)
        self.db = self.client['Anjuke']
        self.col = self.db['House']

    @staticmethod
    def _build_proxies(agent):
        """Return a requests ``proxies`` mapping for one proxy API entry.

        Both schemes are mapped: requests picks the proxy by the scheme of
        the target URL, and every page fetched here is served over https.
        """
        proxy_meta = "http://%(host)s:%(port)s" % {
            "host": agent['ip'],
            "port": agent['port'],
        }
        return {"http": proxy_meta, "https": proxy_meta}

    @staticmethod
    def _first(values, index=0, default=MISSING):
        """Return ``values[index]``, or ``default`` when the list is too short."""
        if len(values) > index:
            return values[index]
        return default

    @classmethod
    def _with_unit(cls, value, unit):
        """Append ``unit`` to ``value`` unless ``value`` is the placeholder."""
        if value == cls.MISSING:
            return value
        return value + unit

    def _fetch_proxy_agents(self):
        """Return the proxy entries offered by the proxy API, or ``[]``."""
        try:
            res = requests.get(self.PROXY_API_URL, timeout=self.REQUEST_TIMEOUT)
            payload = json.loads(res.content)
        except (requests.RequestException, ValueError):
            # Unreachable API or a non-JSON body: carry on without proxies.
            return []
        agents = payload.get("data") if isinstance(payload, dict) else None
        return agents if isinstance(agents, list) else []

    def get_data(self):
        """Download the page at ``self.url`` and return the response.

        Each proxy offered by the proxy API is tried in turn; the first one
        that yields a successful response is kept in ``self.proxies``. If no
        proxy works, the page is requested directly as a last resort.

        Raises:
            requests.RequestException: if the final direct request fails.
        """
        for agent in self._fetch_proxy_agents():
            try:
                self.proxies = self._build_proxies(agent)
                response = requests.get(url=self.url, headers=self.headers,
                                        proxies=self.proxies,
                                        timeout=self.REQUEST_TIMEOUT)
            except (KeyError, TypeError, requests.RequestException):
                # Malformed entry or dead proxy: move on to the next one.
                continue
            if response.ok:
                return response
        self.proxies = None
        return requests.get(url=self.url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT)

    def _parse_listing(self, node, link):
        """Build the list-page part of a record from one listing node."""
        temp = {}
        temp["标题"] = self._first(node.xpath("./div[1]/h3/a/b/text()"))
        temp["链接"] = link
        temp["价格"] = self._with_unit(
            self._first(node.xpath("./div[2]/p/strong/b/text()")), '元')
        temp["大小"] = self._with_unit(
            self._first(node.xpath("./div[1]/p[1]/b[3]/text()")), '平米')
        shi = self._first(node.xpath("./div[1]/p[1]/b[1]/text()"))
        ting = self._first(node.xpath("./div[1]/p[1]/b[2]/text()"))
        if self.MISSING in (shi, ting):
            temp["户型"] = self.MISSING
        else:
            temp["户型"] = shi + '室' + ting + '厅'
        temp["姓名"] = self._first(node.xpath("./div[1]/p[1]/text()[6]")).strip()
        temp["小区"] = self._first(node.xpath("./div[1]/address/a/text()"))
        temp["地址"] = self._first(node.xpath("./div[1]/address/text()"), index=1).strip()
        temp["整租"] = self._first(node.xpath("./div[1]/p[2]/span[1]/text()"))
        temp["方向"] = self._first(node.xpath("./div[1]/p[2]/span[2]/text()"))
        return temp

    def _fetch_detail(self, detail_url):
        """Download a detail page and return its fields.

        Every field falls back to the placeholder when the page cannot be
        downloaded or the element is absent.
        """
        detail = {key: self.MISSING for key, _ in self.DETAIL_XPATHS}
        try:
            response = requests.get(url=detail_url, headers=self.headers,
                                    proxies=self.proxies,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return detail
        html = etree.HTML(response.content)
        if html is None:
            return detail
        for key, path in self.DETAIL_XPATHS:
            detail[key] = self._first(html.xpath(path))
        return detail

    def parse_data(self, response):
        """Extract every listing on a result page as a list of dicts.

        Args:
            response: the response returned by ``get_data``.

        Returns:
            One dict per listing, keyed by the names in ``FIELDS``. Listings
            without a detail link are skipped.
        """
        html = etree.HTML(response.content)
        if html is None:
            return []
        data_list = []
        for node in html.xpath('//div[@class="zu-itemmod"]'):
            detail_link = self._first(node.xpath("./div[1]/h3/a/@href"), default=None)
            if detail_link is None:
                continue
            temp = self._parse_listing(node, detail_link)
            # The detail URL stays local so self.url keeps pointing at the
            # listing page being crawled.
            print(detail_link)
            temp.update(self._fetch_detail(detail_link))
            print(temp)
            data_list.append(temp)
        return data_list

    def save_data(self, data_list):
        """Write each record to the CSV file and insert it into MongoDB.

        Uses the module-level ``csv_writer``. The MongoDB client is left open
        because ``run`` calls this once per page; ``run`` closes it at the end.
        """
        for data in data_list:
            csv_writer.writerow([data.get(key, self.MISSING) for key in self.FIELDS])
            self.col.insert_one(data)

    def run(self):
        """Crawl page after page until no next-page link remains."""
        visited = set()
        try:
            # The visited set stops the crawl if pagination ever loops back.
            while self.url not in visited:
                visited.add(self.url)
                response = self.get_data()
                data_list = self.parse_data(response)
                self.save_data(data_list)
                html = etree.HTML(response.content)
                if html is None:
                    break
                next_links = html.xpath('//*[contains(text(),"下一页")]/@href')
                if not next_links:
                    break
                next_url = next_links[0]
                print(next_url)
                self.url = next_url
        finally:
            self.client.close()
if __name__ == '__main__':
    # Header row; the column order matches the rows emitted by save_data.
    head = [
        "标题", "链接", "价格", "大小", "户型", "姓名", "小区",
        "地址", "整租", "方向", "要求", "描述", "图片", "日期",
    ]
    # NOTE(review): gb18030 is presumably chosen so the file opens cleanly in
    # Excel on a Chinese-locale Windows machine; confirm before changing it.
    with open('安居客.csv', 'w', newline='', encoding="gb18030") as csv_file:
        # csv_writer is a module-level name read by Anjuke.save_data.
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(head)
        # The crawl runs inside the with-block so the file stays open
        # for the whole run.
        anjuke = Anjuke()
        anjuke.run()
将数据存储到 CSV 文件和 MongoDB 数据库中,并实现了翻页功能。由于频繁访问详情页会被屏蔽(返回 403),代码中还添加了代理 IP 进行爬取!
我爬取的是潍坊,如果想爬其他的地方,wf换成你想爬的城市的缩写就行
希望大家多多给我点赞呀!