Scraping Sina domestic news with Python (including data fetched from JS)
# -*- coding: utf-8 -*-
import requests
import re
from datetime import datetime
import json
from bs4 import BeautifulSoup

new_urls = set()  # set of article URLs that have not been visited yet

# Fetch a page and return it as a parsed BeautifulSoup object
def get_soup(url):
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    return soup

# Collect article links from the news list items and add them to new_urls
def add_urls(arr):
    for link in arr:
        if len(link.select('h2')) > 0:
            h2 = link.select('h2')[0]
            a = h2.select('a')
            if len(a) > 0:
                a = a[0]
                time = link.select('.time')
                if len(time) > 0:
                    # print(h2.text, time[0].text, a['href'])
                    new_urls.add(a['href'])

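To make the selector chain in add_urls concrete, here is a small standalone check (not part of the crawler itself) against hand-written markup. The fragment below is only an assumption shaped after the selectors the function uses (an h2 containing an a, plus a .time node); the real Sina list page may contain more attributes and nesting.

from bs4 import BeautifulSoup

# Hypothetical .news-item markup, modeled only on the selectors used in add_urls
sample = ('<div class="news-item"><h2>'
          '<a href="http://news.sina.com.cn/c/doc-iexample.shtml">headline</a>'
          '</h2><span class="time">5月2日 10:28</span></div>')
item = BeautifulSoup(sample, 'html.parser').select('.news-item')[0]
print(item.select('h2')[0].select('a')[0]['href'])  # the link add_urls would collect
print(item.select('.time')[0].text)                 # the timestamp it checks for
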
# Pop one unvisited URL from new_urls
def get_url():
    if new_urls:  # pop() on an empty set would raise KeyError
        return new_urls.pop()

# Extract the article id from the news URL, build the comment API URL,
# and return the total number of comments
def get_commentsJs(url):
    m = re.search('doc-i(.+).shtml', url)
    api = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}"
    comments = requests.get(api.format(m.group(1)))  # fill the article id into the API URL
    comments.encoding = 'utf-8'
    jd = json.loads(comments.text.strip('var data='))  # drop the "var data=" JS prefix so the rest parses as JSON
    return jd['result']['count']['total']

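The comment count is not in the article HTML; the endpoint above returns it as a JavaScript assignment, which is why the prefix has to be stripped before json.loads. A minimal sketch of that parsing step, using a made-up payload that only mirrors the keys the function actually reads (result -> count -> total); the real response carries many more fields.

import json

# Made-up response body for illustration only
sample_js = 'var data={"result": {"count": {"total": 123}}}'
jd = json.loads(sample_js.strip('var data='))  # remove the JS variable prefix, keep the JSON object
print(jd['result']['count']['total'])          # -> 123
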
# Extract the fields we need from the article page
def get_information(soup, url):
    info = {}
    title = soup.select_one('#artibodyTitle')
    info['title'] = title.text
    time_source = soup.select_one('.time-source')
    time = time_source.contents[0].strip()  # publication time as a string
    info['site'] = time_source.contents[1].text.strip('\n')  # news source
    info['time'] = datetime.strptime(time, '%Y年%m月%d日%H:%M')  # parse the string into a datetime
    content = ' '.join([p.text.strip() for p in soup.select('#artibody p')[0:-1]])  # list comprehension; the last <p> holds the editor line
    info['content'] = content
    # Extract the editor in charge, dropping the "责任编辑:" prefix
    editor = soup.select_one('.article-editor').text.lstrip('责任编辑:')
    info['editor'] = editor
    info['comments'] = get_commentsJs(url)
    return info

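The publication time on the page is a Chinese-formatted string, so strptime needs the matching format codes. A quick check with a made-up timestamp (the date below is only an example):

from datetime import datetime

print(datetime.strptime('2018年05月02日10:28', '%Y年%m月%d日%H:%M'))
# -> 2018-05-02 10:28:00
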
# Crawl every news item on the domestic news page
root_url = "http://news.sina.com.cn/china/"
soup = get_soup(root_url)
items = soup.select('.news-item')
add_urls(items)  # add the newly found article URLs to the set
content = []

while new_urls:  # stop once every collected URL has been visited
    url = get_url()            # take one unvisited URL from the set
    soup = get_soup(url)       # fetch and parse the article page
    info = get_information(soup, url)
    content.append(info)
    print(info)
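
If you want to keep the results rather than just print them, one common follow-up is to load the list of dicts into a table. A minimal sketch, assuming pandas (and openpyxl for Excel output) is installed; the file name is just an example and not part of the original script.

import pandas as pd

df = pd.DataFrame(content)     # one row per article: title, site, time, content, editor, comments
df.to_excel('sina_news.xlsx')  # or df.to_csv('sina_news.csv')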