
Python crawler example: scraping user information from https://www.qiushibaike.com/ and saving it to a MySQL database


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/3/5 23:32
# @Author : hyang
# @Site :
# @File : scrapy_qsbk.py
# @Software: PyCharm
import time

import requests
import pymysql
from bs4 import BeautifulSoup
from requests.exceptions import ReadTimeout, ConnectionError, RequestException

'''
Scrape the user information behind the trending posts on
https://www.qiushibaike.com and save it to a MySQL database.
'''

start_url = 'https://www.qiushibaike.com'


class qsbk(object):
    def __init__(self):
        self.session = requests.Session()  # the session keeps cookies across requests
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        }
        mysql_connect_dict = {
            'host': '127.0.0.1',
            'port': 3333,
            'user': 'jianeng',
            'password': 'qwe123',
            'db': 'info',
            'charset': 'utf8'
        }
        # connect to the database
        self.conn = pymysql.connect(**mysql_connect_dict)
        self.cursor = self.conn.cursor()

    def get_response(self, url):
        try:
            response = self.session.get(url, headers=self.headers, timeout=1)
            if response.status_code == 200:
                return response.text
            else:
                # retry after a short pause when the status code is not 200
                time.sleep(1)
                return self.get_response(url)
        except ReadTimeout:
            print('ReadTimeout')
        except ConnectionError:  # network is unreachable
            print('ConnectionError')
        except RequestException:
            print('Error')

    # parse the user profile URLs, e.g. /users/24057284/
    def parse_userurl(self, text):
        soup = BeautifulSoup(text, 'lxml')
        author_li = soup.find_all('div', class_="author clearfix")
        url_li = []
        for item in author_li:
            if item.find('a') is not None:
                url = item.find('a').attrs['href']
                url_li.append(url)
        return url_li

    # parse the statistics and profile fields on a user page
    def parse_userdata(self, text):
        soup = BeautifulSoup(text, 'lxml')
        if '当前用户已关闭糗百个人动态' in text:  # the user has disabled their public profile
            print('user profile is closed')
            return None
        else:
            username = soup.find('h2').text
            result = soup.find_all('div', class_='user-statis')
            number = result[0].find_all('li')[0].text
            attentions = result[0].find_all('li')[1].text
            comments = result[0].find_all('li')[3].text
            constellation = result[1].find_all('li')[1].text
            occupation = result[1].find_all('li')[2].text
            address = result[1].find_all('li')[3].text
            return username, number, attentions, comments, constellation, occupation, address

    # save one user's data to the database
    def save_mydata(self, data):
        if data is not None:
            sql = ('insert into qsbk_user (username,num,attentions,comments,constellation,occupation,address) '
                   'VALUES (%s,%s,%s,%s,%s,%s,%s)')
            # strip the "label:" prefix from each field,
            # e.g. ['绠纱猫猫', '16', '3', '297', '天蝎座', '家里蹲', '湖南 · 长沙']
            li = [item.split(":")[-1] for item in data]
            try:
                self.cursor.execute(sql, tuple(li))
                self.conn.commit()
            except Exception as e:
                print(e)

    def main(self, url):
        response = self.get_response(url)
        try:
            url_li = self.parse_userurl(response)
            for item in url_li:
                user_detail_url = url + item
                data = self.parse_userdata(self.get_response(user_detail_url))
                self.save_mydata(data)
        except IndexError as e:
            print(e)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    qsbk().main(start_url)
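
The insert statement in save_mydata assumes that a qsbk_user table already exists in the info database. The original post does not show the schema, so the following is only a minimal sketch of how such a table could be created with pymysql; the column types and lengths are assumptions based on the fields scraped above.

import pymysql

# Minimal sketch: create the qsbk_user table that save_mydata() writes to.
# Column types and lengths are assumptions; the original post does not give the schema.
conn = pymysql.connect(host='127.0.0.1', port=3333, user='jianeng',
                       password='qwe123', db='info', charset='utf8')
create_sql = '''
CREATE TABLE IF NOT EXISTS qsbk_user (
    id            INT AUTO_INCREMENT PRIMARY KEY,
    username      VARCHAR(64),
    num           VARCHAR(32),
    attentions    VARCHAR(32),
    comments      VARCHAR(32),
    constellation VARCHAR(32),
    occupation    VARCHAR(64),
    address       VARCHAR(128)
) DEFAULT CHARSET=utf8
'''
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()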

 
