Fetching weather alerts with Scrapy
qxyj.py:
# -*- coding: utf-8 -*-
import datetime
import time

import scrapy

from QXYJ.items import QxyjItem


class QxyjSpider(scrapy.Spider):
    name = 'qxyj'
    allowed_domains = ['weather.com.cn']
    # start_urls = ['http://www.weather.com.cn/alarm/newalarmlist.shtml?level=3']
    start_urls = ['http://product.weather.com.cn/alarm/grepalarm_cn.php']

    def parse(self, response):
        items = []
        # Fetch the alert list. The endpoint returns a JSONP-style JS assignment,
        # so strip the trailing character and keep everything after the first "=".
        result_ = response.text[:-1]
        result = result_.split("=", 1)[1]
        # Note: eval() fails on JSON's null, so bind null to an empty string first.
        null = ''
        result = eval(result)
        data = result['data']
        length = len(data)
        # Today's date, in the two formats used below
        today = datetime.datetime.now().strftime("%Y%m%d")
        today2 = datetime.datetime.now().strftime("%Y-%m-%d")
        # First half of the detail-data URL
        start_url = 'http://product.weather.com.cn/alarm/webdata/'
        # Keep at most the first 30 entries
        for i in range(min(length, 30)):
            item = QxyjItem()
            # File names look like 101100503-20210507103742-6801.html;
            # the middle segment starts with the publication date.
            end_url = data[i][1]
            pub_time = end_url.split("-")[1][:8]
            if pub_time == today:
                # The public page, e.g.
                # http://www.weather.com.cn/alarm/newalarmcontent.shtml?file=1011201-20210507091026-0501.html,
                # is rendered by JS and cannot be scraped directly,
                # so fetch the underlying data file instead.
                item['news_url'] = start_url + end_url
                item['pub_time'] = today2
                items.append(item)
        for item in items:
            # Crude throttle; DOWNLOAD_DELAY in settings.py is the idiomatic alternative
            time.sleep(1)
            yield scrapy.Request(url=item['news_url'], meta={'meta_1': item}, callback=self.parse_news)

    def parse_news(self, response):
        item = QxyjItem()
        meta_1 = response.meta['meta_1']
        # Wrapper markup for the saved body text (quotes inside the style
        # attribute must be single quotes to keep the HTML valid)
        start_content = ("<span style=\"color:#222222;font-family:'Microsoft YaHei';"
                         "white-space:pre-wrap;font-size:16px;\"> ")
        end_content = '</span>'
        # Same JS-assignment format as the list page
        result = response.text.split("=", 1)[1]
        null = ''  # same null guard as in parse()
        result = eval(result)
        news_title = result['head']
        content = result['ISSUECONTENT']
        key_word = result['SIGNALTYPE']
        item['pub_time'] = meta_1['pub_time']
        item['news_title'] = news_title
        item['source'] = '中国天气网'  # China Weather Network
        item['key_word'] = key_word
        item['is_pub'] = '否'  # "No": not yet uploaded
        item['content'] = start_content + content + end_content
        # Use a truncated body as the summary when the text is long
        if len(content) > 150:
            item['news_guide'] = content[:100] + "......"
        else:
            item['news_guide'] = content
        yield item
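The eval() plus null-placeholder trick works, but the payload after the "=" is essentially JSON, and json.loads parses null natively (mapping it to None). A minimal alternative sketch, assuming the endpoint really returns valid JSON after the assignment; parse_js_assignment is an illustrative helper, not part of the original project:

import json

def parse_js_assignment(text):
    # Keep everything after the first "=" and drop a trailing ";" if present
    payload = text.split("=", 1)[1].strip().rstrip(";")
    # json.loads turns null into None, so no placeholder binding is needed
    return json.loads(payload)

# Usage inside parse(): result = parse_js_assignment(response.text)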
items.py:
import scrapy


class QxyjItem(scrapy.Item):
    # define the fields for your item here like:
    # Title
    news_title = scrapy.Field()
    # Source
    source = scrapy.Field()
    # Keyword (alert signal type)
    key_word = scrapy.Field()
    # Article summary
    news_guide = scrapy.Field()
    # Body
    content = scrapy.Field()
    # Article URL
    news_url = scrapy.Field()
    # Publish time
    pub_time = scrapy.Field()
    # Upload status
    is_pub = scrapy.Field()
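QxyjItem behaves like a dict but accepts only the fields declared above, which catches typos early; a short illustrative check (the sample values are made up):

item = QxyjItem(news_title='暴雨蓝色预警', source='中国天气网')
print(item['news_title'])
# Assigning an undeclared field raises KeyError:
# item['author'] = 'x'  # KeyError: 'QxyjItem does not support field: author'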
pipelines.py:
import json
import time

from openpyxl import Workbook


# Encoding helper: a json.JSONEncoder subclass that decodes bytes to UTF-8
# strings. It is not used by the Excel pipeline below, but is useful when
# exporting items as JSON (see the sketch after this file).
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class QxyjPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row: title, source, keyword, summary, body, publish time, uploaded
        self.ws.append(['标题', '来源', '关键字', '文章导读', '正文', '发布时间', '是否上传'])

    def process_item(self, item, spider):
        text = [item['news_title'], item['source'], item['key_word'], item['news_guide'],
                item['content'], item['pub_time'], item['is_pub']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        # Save the workbook with today's date in the file name
        time_file = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save('qxyj' + time_file + '.xlsx')
        print("All items processed; workbook saved.")
settings.py:
ITEM_PIPELINES = {
    'QXYJ.pipelines.QxyjPipeline': 300,
}
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Optionally write the log to a local file
LOG_FILE = "qxyj.log"
LOG_LEVEL = "DEBUG"
# Redirect print() output into the log file as well
LOG_STDOUT = True
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
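To start the crawl without typing the scrapy command each time, a small launcher script at the project root (next to scrapy.cfg) does the same thing; run.py is an assumed name:

run.py:
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl qxyj" in the project directory
execute(['scrapy', 'crawl', 'qxyj'])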