scrapy的pipeline是一个非常重要的模块,主要作用是将return的items写入到数据库、文件等持久化模块,下面我们就简单的了解一下pipelines的用法。
案例一:
items池
class ZhihuuserItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
id = scrapy.Field()
name = scrapy.Field()
avatar_url = scrapy.Field()
headline = scrapy.Field()
description = scrapy.Field()
url = scrapy.Field()
url_token = scrapy.Field()
gender = scrapy.Field()
cover_url = scrapy.Field()
type = scrapy.Field()
badge = scrapy.Field()
answer_count = scrapy.Field()
articles_count = scrapy.Field()
commercial_question = scrapy.Field()
favorite_count = scrapy.Field()
favorited_count = scrapy.Field()
follower_count = scrapy.Field()
following_columns_count = scrapy.Field()
following_count = scrapy.Field()
pins_count = scrapy.Field()
question_count = scrapy.Field()
thank_from_count = scrapy.Field()
thank_to_count = scrapy.Field()
thanked_count = scrapy.Field()
vote_from_count = scrapy.Field()
vote_to_count = scrapy.Field()
voteup_count = scrapy.Field()
following_favlists_count = scrapy.Field()
following_question_count = scrapy.Field()
following_topic_count = scrapy.Field()
marked_answers_count = scrapy.Field()
mutual_followees_count = scrapy.Field()
participated_live_count = scrapy.Field()
locations = scrapy.Field()
educations = scrapy.Field()
employments = scrapy.Field()
写入MongoDB数据库的基本配置
#配置MongoDB数据库的连接信息
MONGO_URL = '172.16.5.239'
MONGO_PORT = 27017
MONGO_DB = 'zhihuuser'
#参数等于False,就等于告诉你这个网站你想取什么就取什么,不会读取每个网站的根目录下的禁止爬取列表(例如:www.baidu.com/robots.txt)
ROBOTSTXT_OBEY = False
执行pipelines下的写入操作
ITEM_PIPELINES = {
'zhihuuser.pipelines.MongoDBPipeline': 300,
}
pipelines.py:
1、首先我们要从settings文件中读取数据的地址、端口、数据库名称(没有会自动创建)。
2、拿到数据库的基本信息后进行连接。
3、将数据写入数据库
4、关闭数据库
注意:只有打开和关闭是只执行一次,而写入操作会根据具体的写入次数而定。
import pymongo
class MongoDBPipeline(object):
"""
1、连接数据库操作
"""
def __init__(self,mongourl,mongoport,mongodb):
'''
初始化mongodb数据的url、端口号、数据库名称
:param mongourl:
:param mongoport:
:param mongodb:
'''
self.mongourl = mongourl
self.mongoport = mongoport
self.mongodb = mongodb
@classmethod
def from_crawler(cls,crawler):
"""
1、读取settings里面的mongodb数据的url、port、DB。
:param crawler:
:return:
"""
return cls(
mongourl = crawler.settings.get("MONGO_URL"),
mongoport = crawler.settings.get("MONGO_PORT"),
mongodb = crawler.settings.get("MONGO_DB")
)
def open_spider(self,spider):
'''
1、连接mongodb数据
:param spider:
:return:
'''
self.client = pymongo.MongoClient(self.mongourl,self.mongoport)
self.db = self.client[self.mongodb]
def process_item(self,item,spider):
'''
1、将数据写入数据库
:param item:
:param spider:
:return:
'''
name = item.__class__.__name__
# self.db[name].insert(dict(item))
self.db['user'].update({'url_token':item['url_token']},{'$set':item},True)
return item
def close_spider(self,spider):
'''
1、关闭数据库连接
:param spider:
:return:
'''
self.client.close()