Create the project
- scrapy startproject xinlang
-
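For reference, startproject generates a layout like this (recent Scrapy versions; minor details vary by version):
- xinlang/
-     scrapy.cfg
-     xinlang/
-         __init__.py
-         items.py
-         middlewares.py
-         pipelines.py
-         settings.py
-         spiders/
-             __init__.py
-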
Generate the spider file
- cd xinlang
- scrapy genspider xinlangnews roll.news.sina.com.cn
-
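genspider writes a minimal stub roughly like the one below (the exact template depends on the Scrapy version); the parse body gets filled in later:
- import scrapy
-
- class XinlangnewsSpider(scrapy.Spider):
-     name = 'xinlangnews'
-     allowed_domains = ['roll.news.sina.com.cn']
-     start_urls = ['http://roll.news.sina.com.cn/']
-
-     def parse(self, response):
-         pass
-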
Create a debug launcher
- cd ..
- import scrapy.cmdline
-
- # run without log output
- scrapy.cmdline.execute('scrapy crawl xinlangnews --nolog'.split())
-
- # run with log output
- #scrapy.cmdline.execute('scrapy crawl xinlangnews'.split())
-
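Save the launcher under any name, e.g. run.py (the name is arbitrary), somewhere inside the project so Scrapy can locate scrapy.cfg, then start it from the IDE debugger or with:
- python run.py
-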
Disable robots.txt compliance in settings.py
- ROBOTSTXT_OBEY = False
-
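A few other standard Scrapy settings are often adjusted at the same time (the values below are only examples):
- # send a browser-like User-Agent instead of the Scrapy default
- USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
- # throttle requests a little
- DOWNLOAD_DELAY = 1
- # keep the console readable without resorting to --nolog
- LOG_LEVEL = 'WARNING'
-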
In xinlangnews.py (the spider file created above)
- from xinlang.items import XinlangItem
- import scrapy
-
- class XinlangnewsSpider(scrapy.Spider):
-     name = 'xinlangnews'
-     allowed_domains = ['roll.news.sina.com.cn']
-     start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
-
-     def parse(self, response):
-         news_list = response.xpath('//ul[@class="list_009"]/li')
-         # news_list = response.css('.list_009 > li')  # the same selection with a CSS selector
-         for new in news_list:
-             new_title = new.xpath('./a/text()').get()
-             new_time = new.xpath('./span/text()').get()
-             item = XinlangItem()
-             item['newstitle'] = new_title
-             item['newstime'] = new_time
-             yield item  # yield the item so it is sent into the pipeline
-
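The start URL above is page 1 of the roll list. To crawl later pages too, something like this could be appended at the end of parse(); the index_N.shtml pattern and the 10-page cap are assumptions inferred from the URL, not part of the original notes:
-         # follow the next roll page (URL pattern and page cap are assumptions)
-         page = int(response.url.rsplit('_', 1)[-1].split('.')[0])
-         if page < 10:
-             next_url = response.url.replace(f'index_{page}.shtml',
-                                             f'index_{page + 1}.shtml')
-             yield response.follow(next_url, callback=self.parse)
-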
In items.py
- import scrapy
-
- class XinlangItem(scrapy.Item):
-     newstitle = scrapy.Field()
-     newstime = scrapy.Field()
-
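A scrapy.Item behaves like a dict, which is why the spider assigns fields by key; a quick illustrative check (values made up):
- >>> from xinlang.items import XinlangItem
- >>> item = XinlangItem(newstitle='demo title', newstime='10-30 12:00')
- >>> item['newstitle']
- 'demo title'
-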
Enable the item pipeline (settings.py)
- around line 65, uncomment (or add):
- ITEM_PIPELINES = {
-     'xinlang.pipelines.XinlangPipeline': 300,
- }
- # 300 is the order value (0-1000); lower numbers run first when several pipelines are enabled
-
In pipelines.py
- from itemadapter import ItemAdapter
- import pymysql
-
- class XinlangPipeline:
-
-     # called once when the spider starts
-     def open_spider(self, spider):
-         print('spider opened')
-         # connect to MySQL
-         self.db = pymysql.connect(
-             host='<MySQL host>', port=3306,
-             user='root', password='<MySQL password>',
-             database='spider88', charset='utf8'
-         )
-         self.cursor = self.db.cursor()
-
-     # called once when the spider finishes
-     def close_spider(self, spider):
-         print('spider closed')
-         self.cursor.close()
-         self.db.close()
-
-     def process_item(self, item, spider):
-         # print(spider.name)  # the spider parameter is required by the method signature; it is the running spider
-
-         print(f'---- {item} ----')
-
-         news_title = item['newstitle']
-         news_time = item['newstime']
-
-         # parameterized SQL: pymysql quotes and escapes the values safely
-         sql = 'insert into xinlangnews(newstitle, newstime) values(%s, %s)'
-
-         # execute and commit
-         self.cursor.execute(sql, (news_title, news_time))
-         self.db.commit()
-
-         return item
-
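The pipeline assumes the xinlangnews table already exists in spider88. A one-off setup sketch (the column types are assumptions; the column names match the item fields):
- import pymysql
-
- db = pymysql.connect(host='<MySQL host>', port=3306,
-                      user='root', password='<MySQL password>',
-                      database='spider88', charset='utf8')
- with db.cursor() as cursor:
-     # an auto-increment id plus the two columns the pipeline inserts
-     cursor.execute('''
-         create table if not exists xinlangnews (
-             id int primary key auto_increment,
-             newstitle varchar(255),
-             newstime varchar(64)
-         )
-     ''')
- db.commit()
- db.close()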