Python crawler example: scrape user info from https://www.qiushibaike.com and save it to a MySQL database

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/3/5 23:32
# @Author  : hyang
# @File    : scrapy_qsbk.py
# @Software: PyCharm

import time

import pymysql
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ReadTimeout, ConnectionError, RequestException

'''
Scrape the user info of authors on the https://www.qiushibaike.com
hot page and save it to a MySQL database.
'''

start_url = 'https://www.qiushibaike.com'


class qsbk(object):

    def __init__(self):
        self.session = requests.session()  # one session, so cookies persist across requests
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        }
        mysql_connect_dict = {
            'host': '127.0.0.1',
            'port': 3333,
            'user': 'jianeng',
            'password': 'qwe123',
            'db': 'info',
            'charset': 'utf8'
        }
        # connect to the database
        self.conn = pymysql.connect(**mysql_connect_dict)
        self.cursor = self.conn.cursor()

    def get_response(self, url, retries=3):
        try:
            response = self.session.get(url, headers=self.headers, timeout=5)
            if response.status_code == 200:
                return response.text
            # non-200 response: wait a second, then retry a bounded number of times
            if retries > 0:
                time.sleep(1)
                return self.get_response(url, retries - 1)
        except ReadTimeout:
            print('ReadTimeout')
        except ConnectionError:  # network unreachable
            print('ConnectionError')
        except RequestException:
            print('Error')
        return None

    # extract the user profile URLs, e.g. /users/24057284/
    def parse_userurl(self, text):
        soup = BeautifulSoup(text, 'lxml')
        author_li = soup.find_all('div', class_="author clearfix")
        url_li = []
        for item in author_li:
            if item.find('a') is not None:
                url = item.find('a').attrs['href']
                url_li.append(url)
        return url_li

    # parse the fields on a user's profile page
    def parse_userdata(self, text):
        if '当前用户已关闭糗百个人动态' in text:
            print('this user has closed their profile')
            return None

        soup = BeautifulSoup(text, 'lxml')
        username = soup.find('h2').text
        result = soup.find_all('div', class_='user-statis')

        number = result[0].find_all('li')[0].text
        attentions = result[0].find_all('li')[1].text
        comments = result[0].find_all('li')[3].text  # note: li[2] is skipped here

        constellation = result[1].find_all('li')[1].text
        occupation = result[1].find_all('li')[2].text
        address = result[1].find_all('li')[3].text

        return username, number, attentions, comments, constellation, occupation, address

    # save one user record to the database
    def save_mydata(self, data):
        if data is not None:
            sql = ('insert into qsbk_user '
                   '(username,num,attentions,comments,constellation,occupation,address) '
                   'VALUES (%s,%s,%s,%s,%s,%s,%s)')
            # each field looks like "label:value"; keep only the part after the colon
            li = [item.split(":")[-1] for item in data]
            # e.g. li = ['绠纱猫猫', '16', '3', '297', '天蝎座', '家里蹲', '湖南 · 长沙']
            try:
                self.cursor.execute(sql, tuple(li))
                self.conn.commit()
            except Exception as e:
                self.conn.rollback()
                print(e)

    def main(self, url):
        response = self.get_response(url)
        if response is None:
            return
        try:
            url_li = self.parse_userurl(response)
            for item in url_li:
                user_detail_url = url + item
                page = self.get_response(user_detail_url)
                if page is None:
                    continue
                data = self.parse_userdata(page)
                self.save_mydata(data)
        except IndexError as e:
            print(e)
        except Exception as e:
            print(e)
        finally:
            self.cursor.close()
            self.conn.close()


if __name__ == '__main__':
    qsbk().main(start_url)
```
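
The script assumes a `qsbk_user` table already exists in the `info` database. The column names below are taken from the INSERT statement in `save_mydata`; the types and lengths are my own guesses (everything is stored as the raw scraped text), so adjust them to your data:

```python
import pymysql

# Hypothetical schema: the column names match the INSERT in save_mydata,
# but the types and lengths are assumptions, not from the original post.
DDL = '''
CREATE TABLE IF NOT EXISTS qsbk_user (
    id INT AUTO_INCREMENT PRIMARY KEY,
    username VARCHAR(64),
    num VARCHAR(16),
    attentions VARCHAR(16),
    comments VARCHAR(16),
    constellation VARCHAR(16),
    occupation VARCHAR(32),
    address VARCHAR(64)
) DEFAULT CHARSET = utf8
'''

conn = pymysql.connect(host='127.0.0.1', port=3333, user='jianeng',
                       password='qwe123', db='info', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()
```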
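`main` only walks the front page. To collect more users you can reuse the same methods over several hot-list pages; a minimal sketch, assuming qiushibaike paginates the hot list under `/8hr/page/<n>/` (an assumption about the site, not something the script above relies on) and that the script is saved as `scrapy_qsbk.py` as in its header:

```python
import time

from scrapy_qsbk import qsbk, start_url  # the script above

spider = qsbk()
try:
    for page in range(1, 6):  # first five hot-list pages; adjust as needed
        page_url = '{}/8hr/page/{}/'.format(start_url, page)  # assumed pagination pattern
        html = spider.get_response(page_url)
        if html is None:
            continue
        for href in spider.parse_userurl(html):
            detail = spider.get_response(start_url + href)
            if detail is not None:
                spider.save_mydata(spider.parse_userdata(detail))
        time.sleep(1)  # be polite between pages
finally:
    spider.cursor.close()
    spider.conn.close()
```

Error handling is kept minimal here; in practice you would also guard `parse_userdata` against `IndexError` the way `main` does, since profile pages that deviate from the expected layout will make the positional `li` lookups fail.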