Create the project
scrapy startproject xinlang
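After the command runs, the generated scaffold looks roughly like this (standard Scrapy layout):
xinlang/
    scrapy.cfg
    xinlang/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py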
Create the spider file
cd xinlang
scrapy genspider xinlangnews roll.news.sina.com.cn
Create a debug script
cd ..
Back in the project root (next to scrapy.cfg), create a launcher script, e.g. main.py, containing:
import scrapy.cmdline

# run without logging
scrapy.cmdline.execute('scrapy crawl xinlangnews --nolog'.split())
# run with logging
# scrapy.cmdline.execute('scrapy crawl xinlangnews'.split())
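Alternatively, the spider can be started programmatically instead of going through the CLI wrapper. A minimal sketch, assuming it is also run from the project root so the project settings are picked up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from xinlang.spiders.xinlangnews import XinlangnewsSpider

# load the project settings and run the spider in-process
process = CrawlerProcess(get_project_settings())
process.crawl(XinlangnewsSpider)
process.start()  # blocks until the crawl finishes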
In settings.py, turn off robots.txt compliance
ROBOTSTXT_OBEY = False
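A couple of other settings are often worth adjusting for a crawl like this; they are optional and the values below are only suggestions:

DOWNLOAD_DELAY = 1  # throttle requests so the site is not hammered
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA; exact string is up to you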
In xinlangnews.py (the spider file created above)
from xinlang.items import XinlangItem
import scrapy


class XinlangnewsSpider(scrapy.Spider):
    name = 'xinlangnews'
    allowed_domains = ['roll.news.sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']

    def parse(self, response):
        news_list = response.xpath('//ul[@class="list_009"]/li')
        # news_list = response.css('.list_009 > li')  # equivalent CSS selector
        for new in news_list:
            new_title = new.xpath('./a/text()').get()
            new_time = new.xpath('./span/text()').get()
            item = XinlangItem()
            item['newstitle'] = new_title
            item['newstime'] = new_time
            yield item  # yield so the item is passed on to the pipeline
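To crawl the later index pages rather than just the first one, parse can also follow the "next page" link. A minimal sketch to append at the end of parse; the XPath for the link is an assumption about the page layout:

        # follow the "next page" link, if present (selector is an assumption)
        next_page = response.xpath('//a[contains(text(), "下一页")]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)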
In items.py
import scrapy


class XinlangItem(scrapy.Item):
    newstitle = scrapy.Field()
    newstime = scrapy.Field()
Enable the item pipeline in settings.py (around line 65):
ITEM_PIPELINES = {
    'xinlang.pipelines.XinlangPipeline': 300,
}
In pipelines.py
from itemadapter import ItemAdapter
import pymysql


class XinlangPipeline:
    # called once when the spider starts
    def open_spider(self, spider):
        print('spider opened')
        # connect to MySQL (replace host/password with your own values)
        self.db = pymysql.connect(
            host='<public IP of the MySQL server>', port=3306,
            user='root', password='<database password>',
            database='spider88', charset='utf8'
        )
        self.cursor = self.db.cursor()

    # called once when the spider closes
    def close_spider(self, spider):
        print('spider closed')
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        # print(spider.name)  # `spider` must be a parameter here; Scrapy passes it in
        print(f'---- {item} ----')
        news_title = item['newstitle']
        news_time = item['newstime']
        # parameterized SQL, so quoting and escaping are handled by the driver
        sql = 'insert into xinlangnews(newstitle, newstime) values(%s, %s)'
        # execute and commit
        self.cursor.execute(sql, (news_title, news_time))
        self.db.commit()
        return item
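The pipeline assumes the spider88 database already contains an xinlangnews table. A one-off setup sketch using pymysql; the column types are assumptions, adjust as needed:

import pymysql

# one-off setup: create the database and table the pipeline writes into
db = pymysql.connect(host='<public IP of the MySQL server>', port=3306,
                     user='root', password='<database password>', charset='utf8')
cursor = db.cursor()
cursor.execute('create database if not exists spider88 default charset utf8')
cursor.execute('use spider88')
cursor.execute(
    'create table if not exists xinlangnews ('
    '  id int primary key auto_increment,'
    '  newstitle varchar(255),'
    '  newstime varchar(64)'
    ')'
)
db.commit()
cursor.close()
db.close()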