Python: scraping Sina domestic news, including data embedded in JS
# -*- coding: utf-8 -*-
import requests
import re
from datetime import datetime
import json
from bs4 import BeautifulSoup
new_urls = set()  # set of article URLs that have not been visited yet
# Fetch a page by URL and return its parsed BeautifulSoup document
def get_soup(url):
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    return soup
# Collect article URLs from the news-item list and add them to the set
def add_urls(arr):
    for link in arr:
        if len(link.select('h2')) > 0:
            h2 = link.select('h2')[0]
            a = h2.select('a')
            if len(a) > 0:
                a = a[0]
                time = link.select('.time')
                if len(time) > 0:
                    # print(h2.text, time[0].text, a['href'])
                    new_urls.add(a['href'])
# Pop an unvisited URL from new_urls
def get_url():
    if new_urls:  # an empty set is falsy; 'is not None' would always be True here
        return new_urls.pop()
# Extract the article id from the news URL, splice it into the comment API URL,
# and return the total comment count
def get_commentsJs(url):
    m = re.search('doc-i(.+).shtml', url)
    url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}"
    comments = requests.get(url.format(m.group(1)))  # fill the article id into the API URL
    comments.encoding = 'utf-8'
    # The response is JSONP ("var data={...}"); remove the prefix before parsing.
    # Note str.strip('var data=') strips *characters*, not a prefix, so slice instead.
    text = comments.text.strip()
    if text.startswith('var data='):
        text = text[len('var data='):]
    jd = json.loads(text)
    return jd['result']['count']['total']
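# Rough shape of the JSONP payload, inferred from the parsing above (an
# assumption, not a documented API; fields beyond result.count.total are unknown):
#   var data={"result": {"count": {"total": <number>, ...}, ...}}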
# Pull the fields we need out of an article page
def get_information(soup, url):
    info = {}  # avoid shadowing the built-in dict
    title = soup.select_one('#artibodyTitle')
    info['title'] = title.text
    time_source = soup.select_one('.time-source')
    time = time_source.contents[0].strip()  # publication time as a str
    info['site'] = time_source.contents[1].text.strip('\n')  # news source
    info['time'] = datetime.strptime(time, '%Y年%m月%d日%H:%M')  # parse the string into a datetime
    # Join every paragraph except the last one (the editor line) into one string
    content = ' '.join([p.text.strip() for p in soup.select('#artibody p')[0:-1]])
    info['content'] = content
    # Extract the editor, removing the leading "责任编辑:" label
    editor = soup.select_one('.article-editor').text.lstrip('责任编辑:')
    info['editor'] = editor
    info['comments'] = get_commentsJs(url)
    return info
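# Note: get_information assumes this particular Sina article template
# (#artibodyTitle, .time-source, .article-editor). On pages with a different
# layout, select_one returns None and the attribute access above will raise.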
# Scrape every article listed on the domestic-news index page
root_url = "http://news.sina.com.cn/china/"
soup = get_soup(root_url)
items = soup.select('.news-item')
add_urls(items)  # add the freshly collected URLs to the set
content = []
while new_urls:  # 'new_urls is None' is never true for a set, so test for emptiness instead
    url = get_url()  # take the next URL to visit from the set
    soup = get_soup(url)  # parse the article page
    info = get_information(soup, url)
    content.append(info)
    print(info)
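To keep the results, here is a minimal follow-up sketch using the standard csv module. It assumes the field names built in get_information; the filename news.csv is just an example, not part of the original scraper:

import csv

with open('news.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'site', 'time', 'content', 'editor', 'comments'])
    writer.writeheader()
    writer.writerows(content)  # datetime values are written via str()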