爬虫--Scrapy(实战多item多表存储)
# settings.py
BOT_NAME = 'xpc'
SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'
# Default Scrapy template user agent — identifies the bot to the site.
USER_AGENT = 'xpc (+http://www.yourdomain.com)'
# robots.txt is deliberately not honored for this crawl.
ROBOTSTXT_OBEY = False
# 1 second between requests to stay polite and avoid throttling/bans.
DOWNLOAD_DELAY = 1
# Route every scraped item through the MySQL storage pipeline below.
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,
}
# spiders/myxpc.py (the spider module)
import os
import re
import scrapy
from xpc.items import *
class MyxpcSpider(scrapy.Spider):
    """Crawl xinpianchang.com and yield one Item type per target table:

    PostItem (videos), ComposersItem (creators), CopyrightsItem
    (post<->creator role rows) and CommentsItem (comments, page 1 only).
    """
    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

    def parse(self, response, **kwargs):
        """Channel index page: build a partial PostItem per video in the
        list, then request its detail page to fill in the remaining fields."""
        # One <li> per video on the channel page.
        li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
        for li in li_list:
            # post id
            pid = li.xpath('./@data-articleid').get()
            # post title
            title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
            # thumbnail (the site lazy-loads images; real URL sits in @_src)
            thumbnail = li.xpath('./a/img/@_src').get()
            # post category
            category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
            # publish time
            created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
            # post description
            description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
            # number of likes
            like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()

            post_item = PostItem()
            post_item['pid'] = pid
            post_item['title'] = title
            post_item['thumbnail'] = thumbnail
            post_item['category'] = category
            post_item['created_at'] = created_at
            post_item['description'] = description
            post_item['like_counts'] = like_counts

            # The detail page supplies play_counts plus the video fields;
            # carry the half-built item along in request.meta.
            post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = post_item
            yield request

    def post_detail(self, response):
        """Video detail page: play count, video-API request, creators,
        copyright rows, and the first page of comments."""
        post_item = response.meta.get('post_item')
        pid = post_item['pid']
        # play count
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts

        # The video id is embedded in inline JS as: vid = "...";
        # guard instead of indexing findall()[0], which raised IndexError
        # whenever the page layout changed.
        vid_match = re.search(r'vid = "(.*?)"', response.text)
        if vid_match:
            video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid_match.group(1)
            request = scrapy.Request(url=video_url, callback=self.vido_detail)
            request.meta['post_item'] = post_item
            yield request

        # Creators credited on this post.
        li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        for li in li_list:
            # creator id
            cid = li.xpath('./a/@data-userid').get()
            composer_item = ComposersItem()
            composer_item['cid'] = cid
            # The creator's profile page carries the remaining fields.
            href = li.xpath('./a/@href').get()
            href = "https://www.xinpianchang.com/" + href
            request2 = scrapy.Request(url=href, callback=self.composer_detail)
            request2.meta['composer_item'] = composer_item
            yield request2

            # Copyright: link-table row joining post and creator.
            copyrights_item = CopyrightsItem()
            copyrights_item['pcid'] = f'{pid}_{cid}'
            copyrights_item['pid'] = pid
            copyrights_item['cid'] = cid
            copyrights_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
            yield copyrights_item

        # Comments: requested once per post (not once per creator).
        # Only page 1 is fetched; pagination is not followed here.
        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24' % pid
        yield scrapy.Request(url=comment_url, callback=self.comment_detail)

    # NOTE(review): "vido" is a typo for "video"; the name is kept so any
    # external references to this callback keep working.
    def vido_detail(self, response):
        """Video API (JSON): duration, preview image and the first
        progressive rendition's URL/format; yields the finished PostItem."""
        post_item = response.meta.get('post_item')
        content = response.json()
        data = content['data']
        post_item['duration'] = data['duration']
        post_item['preview'] = data['cover']
        # First entry of the progressive stream list.
        progressive = data['resource']['progressive'][0]
        post_item['video'] = progressive['url']
        post_item['video_format'] = progressive['mime']
        yield post_item

    def composer_detail(self, response):
        """Creator profile page: fill in the remaining ComposersItem fields."""
        composer_item = response.meta['composer_item']
        # The banner URL is buried in an inline style, e.g.
        # background-image:url(https://cs.xinpianchang.com/.../bg-big.jpg)
        # Guard against a missing style attribute / missing "https" marker.
        banner_style = response.xpath('//div[@class="banner-wrap"]/@style').get() or ''
        start = banner_style.find("https")
        banner = banner_style[start:-1] if start != -1 else None
        # avatar
        avatar = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # verified badge present?
        verified = response.xpath('//span[@class="avator-wrap-s"]/span[contains(@class, "author-v")]').get()
        verified = 'yes' if verified else 'no'
        # display name
        name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
        # self introduction
        intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
        # Counters arrive with thousands separators; the nodes can be
        # missing (None), so strip commas None-safely instead of crashing.
        like_counts = self._strip_commas(response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get())
        fans_counts = self._strip_commas(response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get())
        follow_counts = self._strip_commas(response.xpath('//span[@class="fw_600 v-center"]/text()').get())
        # location
        location = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][1]/text()').get()
        # occupation
        career = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][2]/text()').get()
        composer_item['banner'] = banner
        composer_item['avatar'] = avatar
        composer_item['verified'] = verified
        composer_item['name'] = name
        composer_item['intro'] = intro
        composer_item['like_counts'] = like_counts
        composer_item['fans_counts'] = fans_counts
        composer_item['follow_counts'] = follow_counts
        composer_item['location'] = location
        composer_item['career'] = career
        yield composer_item

    @staticmethod
    def _strip_commas(text):
        """Remove thousands separators from a counter string; None-safe."""
        return text.replace(',', '') if text else text

    def comment_detail(self, response):
        """Comment API (JSON): one CommentsItem per comment on page 1."""
        content = response.json()
        for comment in content['data']['list']:
            item = CommentsItem()
            item['commentid'] = comment['id']
            item['pid'] = comment['resource_id']
            item['cid'] = comment['userid']
            item['avatar'] = comment['userInfo']['avatar']
            item['uname'] = comment['userInfo']['username']
            item['created_at'] = comment['addtime']
            item['content'] = comment['content']
            item['like_counts'] = comment['count_approve']
            # 'referer' marks a reply: store the parent comment id, else '0'.
            if comment.get('referer'):
                item['reply'] = comment['referer']['id']
            else:
                item['reply'] = '0'
            yield item
# items.py
from scrapy import Item, Field
# Post (short video) record
class PostItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'posts'
    pid = Field()           # primary key of the posts table
    title = Field()         # post title
    thumbnail = Field()     # video thumbnail
    category = Field()      # post category
    created_at = Field()    # publish time
    description = Field()   # post description
    like_counts = Field()   # number of likes
    duration = Field()      # playback duration
    preview = Field()       # video preview image
    video = Field()         # video URL
    video_format = Field()  # video format (MIME)
    play_counts = Field()   # number of plays
# Creator (author) record
class ComposersItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'composers'
    cid = Field()           # primary key of the composers table
    banner = Field()        # profile-page banner image
    avatar = Field()        # avatar image
    verified = Field()      # verified badge ('yes'/'no')
    name = Field()          # display name
    intro = Field()         # self introduction
    like_counts = Field()   # times liked
    fans_counts = Field()   # follower count
    follow_counts = Field() # following count
    location = Field()      # location
    career = Field()        # occupation
# Comment record
class CommentsItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'comments'
    commentid = Field()     # primary key of the comments table
    pid = Field()           # id of the commented post
    cid = Field()           # commenter's user id
    avatar = Field()        # commenter's avatar
    uname = Field()         # commenter's name
    created_at = Field()    # publish time
    content = Field()       # comment text
    like_counts = Field()   # number of likes
    reply = Field()         # id of the comment this replies to; '0' if not a reply
# Copyright: link record joining a post with one of its creators
class CopyrightsItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'copyrights'
    pcid = Field()          # primary key, formatted '<pid>_<cid>'
    pid = Field()           # posts table primary key
    cid = Field()           # composers table primary key
    roles = Field()         # role(s) the creator played on the post
# pipelines.py (the core of this example)
import os
from itemadapter import ItemAdapter
import pymysql
from urllib import request
class XpcPipeline:
    """Persist every scraped Item into MySQL.

    Each Item class declares its target table via a ``table_name`` class
    attribute. Rows are written with INSERT ... ON DUPLICATE KEY UPDATE
    (an upsert): if the primary/unique key already exists the row is
    updated, otherwise it is inserted — so re-crawling refreshes data
    instead of failing on duplicate keys.
    """

    def open_spider(self, spider):
        """Open one MySQL connection for the whole crawl."""
        # utf8mb4 so 4-byte characters (e.g. emoji in comments) are storable.
        self.db = pymysql.connect(
            host='localhost', port=3306,
            user='root', password='root',
            database='xpc_2020', charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        """Release the cursor and connection when the crawl ends."""
        self.cursor.close()
        self.db.close()

    @staticmethod
    def _build_upsert_sql(table_name, keys):
        """Build the parameterized upsert statement for ``table_name``.

        For ('copyrights', ['pcid', 'pid', 'cid', 'roles']) this returns:
        insert into `copyrights`(`pcid`,`pid`,`cid`,`roles`)
        values(%s,%s,%s,%s)
        on duplicate key update `pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s
        The %s placeholders appear twice (insert part + update part), so
        callers must pass the value list doubled.
        """
        # Backtick-quoted column list: "`pcid`,`pid`,..."
        key_str = ','.join('`%s`' % k for k in keys)
        # One %s placeholder per column: "%s,%s,..."
        values_str = ','.join(['%s'] * len(keys))
        # Update clause: "`pcid`=%s,`pid`=%s,..."
        update_str = ','.join('`{}`=%s'.format(k) for k in keys)
        return 'insert into `{}`({}) values({}) on duplicate key update {}'.format(
            table_name, key_str, values_str, update_str
        )

    def process_item(self, item, spider):
        """Upsert one item; on DB error, roll back and keep crawling."""
        table_name = item.table_name
        keys = list(item.keys())      # e.g. ['pcid', 'pid', 'cid', 'roles']
        values = list(item.values())  # e.g. ['333', '222', '111', 'producer']
        sql = self._build_upsert_sql(table_name, keys)
        try:
            # values doubled: once for VALUES(...), once for the UPDATE part.
            self.cursor.execute(sql, values * 2)
            self.db.commit()
        except pymysql.MySQLError:
            # Roll back so the connection stays usable; one bad row must
            # not abort the whole crawl.
            self.db.rollback()
            spider.logger.exception('failed to upsert into %s', table_name)
        else:
            spider.logger.debug('upserted one row into %s', table_name)
        return item