Python: scraping Sina domestic news, including data embedded in JS
# -*- coding: utf-8 -*-
import requests
import re
from datetime import datetime
import json
from bs4 import BeautifulSoup
new_urls = set()  # set of article URLs that have not been visited yet
# Fetch a page by URL and return its parsed BeautifulSoup document
def get_soup(url):
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    return soup
# Collect article URLs from the news-item list and add them to the set
def add_urls(arr):
    for link in arr:
        if len(link.select('h2')) > 0:
            h2 = link.select('h2')[0]
            a = h2.select('a')
            if len(a) > 0:
                a = a[0]
                time = link.select('.time')
                if len(time) > 0:
                    # print(h2.text, time[0].text, a['href'])
                    new_urls.add(a['href'])
# Pop an unvisited URL from new_urls
def get_url():
    if new_urls:  # an empty set is falsy; 'is not None' would always be True here
        return new_urls.pop()
# Extract the article id from the news URL, splice it into the comment API URL,
# and return the total comment count
def get_commentsJs(url):
    m = re.search('doc-i(.+).shtml', url)
    url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}"
    comments = requests.get(url.format(m.group(1)))  # fill the article id into the API URL
    comments.encoding = 'utf-8'
    # The response is JSONP ("var data={...}"); remove the prefix before parsing.
    # Note str.strip('var data=') strips *characters*, not a prefix, so slice instead.
    text = comments.text.strip()
    if text.startswith('var data='):
        text = text[len('var data='):]
    jd = json.loads(text)
    return jd['result']['count']['total']
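# Rough shape of the JSONP payload, inferred from the parsing above (an
# assumption, not a documented API; fields beyond result.count.total are unknown):
#   var data={"result": {"count": {"total": <number>, ...}, ...}}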
# Pull the fields we need out of an article page
def get_information(soup, url):
    info = {}  # avoid shadowing the built-in dict
    title = soup.select_one('#artibodyTitle')
    info['title'] = title.text
    time_source = soup.select_one('.time-source')
    time = time_source.contents[0].strip()  # publication time as a str
    info['site'] = time_source.contents[1].text.strip('\n')  # news source
    info['time'] = datetime.strptime(time, '%Y年%m月%d日%H:%M')  # parse the string into a datetime
    # Join every paragraph except the last one (the editor line) into one string
    content = ' '.join([p.text.strip() for p in soup.select('#artibody p')[0:-1]])
    info['content'] = content
    # Extract the editor, removing the leading "责任编辑:" label
    editor = soup.select_one('.article-editor').text.lstrip('责任编辑:')
    info['editor'] = editor
    info['comments'] = get_commentsJs(url)
    return info
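# Note: get_information assumes this particular Sina article template
# (#artibodyTitle, .time-source, .article-editor). On pages with a different
# layout, select_one returns None and the attribute access above will raise.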
# Scrape every article listed on the domestic-news index page
root_url = "http://news.sina.com.cn/china/"
soup = get_soup(root_url)
items = soup.select('.news-item')
add_urls(items)  # add the freshly collected URLs to the set
content = []
while new_urls:  # 'new_urls is None' is never true for a set, so test for emptiness instead
    url = get_url()  # take the next URL to visit from the set
    soup = get_soup(url)  # parse the article page
    info = get_information(soup, url)
    content.append(info)
    print(info)
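To keep the results, here is a minimal follow-up sketch using the standard csv module. It assumes the field names built in get_information; the filename news.csv is just an example, not part of the original scraper:

import csv

with open('news.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'site', 'time', 'content', 'editor', 'comments'])
    writer.writeheader()
    writer.writerows(content)  # datetime values are written via str()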