这次依然是爬取猫眼电影排行榜,然后将爬取到的数据分别存储为TXT、JSON、CSV文件,或者分别存入MySQL、MongoDB、Redis等数据库中。
这次操作和依赖技术相对复杂些,既要掌握基本sql语句,也要配置各种数据库环境。
- # coding = utf-8
- __author__ = 'wardseptember'
- __date__ = '18-10-26'
- import json
- import requests
- from requests.exceptions import RequestException
- import re
- from redis import StrictRedis
-
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when a ``RequestException`` (which
        includes timeouts) is raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except RequestException:
        return None
    return None
-
# Compiled once at import time. Raw strings keep ``\d`` a regex escape instead
# of an (invalid) string escape. re.S lets ``.`` span the newlines inside a
# <dd> entry.
_MOVIE_ENTRY_RE = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in ``html``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example a failed download) yields nothing instead of raising.

    Yields:
        Dicts with the string-valued keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``. ``actor`` is the stripped "star"
        text with its first 3 characters removed, ``time`` is the stripped
        "releasetime" text with its first 5 characters removed, and ``score``
        is the integer and fraction parts concatenated.
    """
    if not html:
        return
    for match in _MOVIE_ENTRY_RE.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drop the fixed-length label that precedes the value.
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            'score': integer + fraction,
        }
-
def write_to_file(content):
    """Append ``content`` to ``result_re.txt`` as one JSON document per line.

    Args:
        content: Any JSON-serialisable object. Non-ASCII characters are
            written verbatim (``ensure_ascii=False``) in UTF-8.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result_re.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
-
- import json
def write_to_file(content):
    """Append ``content`` to ``result_re.json`` as one JSON document per line.

    Args:
        content: Any JSON-serialisable object. Non-ASCII characters are
            written verbatim (``ensure_ascii=False``) in UTF-8.

    Note:
        Each call appends a separate line, so the file is a sequence of JSON
        documents (one per line) rather than a single JSON array.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result_re.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
-
- import csv
def write_to_file(content):
    """Append one movie record to ``result_re.csv``.

    The header row is written only when the file is still empty, so calling
    this function once per record produces a single header followed by the
    data rows.

    Args:
        content: Mapping with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score``.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` if ``content`` contains a
            key that is not one of the field names.
    """
    fieldnames = ['index', 'image', 'title', 'actor', 'time', 'score']
    # newline='' hands line-ending control to the csv module, which avoids
    # blank lines between rows on platforms that translate '\n'.
    with open('result_re.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the position starts at the end of the file, so 0
        # means the file is new or empty and still needs its header.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerow(content)
-
-
我的系统是ubuntu 16.04 LTS,我安装了mysql5.7和可视化工具mysql-workbench,安装配置教程在这,就不详细说了。
连接数据库时要修改host='localhost', user='root', password='156352', port=3306这些参数,对应填入你自己的配置就行了。
- #创建数据库
def createDatabase():
    """Create the ``MaoYanDB`` database on the local MySQL server if missing.

    Connects to ``localhost:3306`` as ``root``, prints the server version,
    then issues ``CREATE DATABASE IF NOT EXISTS`` with a utf8 default
    character set. The connection is closed even when a statement fails.
    """
    # NOTE(review): relies on ``pymysql`` being imported at module level;
    # that import is not visible in this part of the file - confirm.
    db = pymysql.connect(host='localhost', user='root', password='156352', port=3306)
    try:
        cursor = db.cursor()
        cursor.execute('SELECT VERSION()')
        data = cursor.fetchone()
        print('Database version:', data)
        cursor.execute("CREATE DATABASE IF NOT EXISTS MaoYanDB DEFAULT CHARACTER SET utf8")
        print("MaoYanDB数据库创建成功")
    finally:
        # Release the connection on both the success and the error path.
        db.close()
-
- #创建表
def createTableMaoYanDB():
    """Create the ``maoyan`` table inside ``MaoYanDB`` if it does not exist.

    The table has the columns ``id`` (INT, primary key), ``title``,
    ``image``, ``actor``, ``releasetime`` (all VARCHAR(255)) and ``score``
    (FLOAT); every column is NOT NULL. The connection is closed even when
    the statement fails.
    """
    # NOTE(review): relies on ``pymysql`` being imported at module level;
    # that import is not visible in this part of the file - confirm.
    db = pymysql.connect(host='localhost', user='root', password='156352',
                         port=3306, db='MaoYanDB', charset='utf8')
    try:
        cursor = db.cursor()
        sql_table = 'CREATE TABLE IF NOT EXISTS maoyan (id INT NOT NULL, title VARCHAR(255) NOT NULL, ' \
                    'image VARCHAR(255) NOT NULL, actor VARCHAR(255) NOT NULL, releasetime VARCHAR(255) NOT NULL, ' \
                    'score FLOAT NOT NULL, PRIMARY KEY(id))'
        cursor.execute(sql_table)
        print("maoyan表创建成功")
    finally:
        # Release the connection on both the success and the error path.
        db.close()
- #写入
def writeMysql(content):
    """Insert one movie record into the ``maoyan`` table of ``MaoYanDB``.

    Args:
        content: Either a dict with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score`` (the shape yielded by
            ``parse_one_page``), or a sequence of six values already ordered
            as ``(id, title, image, actor, releasetime, score)``.

    Raises:
        KeyError: If a dict ``content`` lacks one of the expected keys.
    """
    if isinstance(content, dict):
        # A dict cannot fill six positional %s placeholders, so map its keys
        # onto the column order of the INSERT statement explicitly.
        row = (content['index'], content['title'], content['image'],
               content['actor'], content['time'], content['score'])
    else:
        row = content
    # NOTE(review): relies on ``pymysql`` being imported at module level;
    # that import is not visible in this part of the file - confirm.
    db = pymysql.connect(host='localhost', user='root', password='156352',
                         port=3306, db='MaoYanDB', charset='utf8')
    try:
        cursor = db.cursor()
        sql_insert = 'INSERT INTO maoyan(id,title,image,actor,releasetime,score) values(%s, %s, %s, %s, %s, %s)'
        cursor.execute(sql_insert, row)
        db.commit()
    finally:
        # Release the connection on both the success and the error path.
        db.close()
-
我的系统是ubuntu 16.04 LTS,下面是我mongodb配置的笔记,仅供参考。网上有很多教程。
- sudo python3 -m pip install pymongo #安装python模块
-
- #mongodb官方下载连接
- https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
-
- sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4
- echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
-
- sudo apt-get update
- sudo apt-get install -y mongodb-org
-
- 在你的/home/用户名/下新建data文件夹,再在data文件下新建db文件夹
- mongod --port 27017 --dbpath ~/data/db  # 不能关,进行以下配置
-
- db.createUser({user: 'admin',pwd:'156352',roles:[{role:'root',db:'admin'}]})
-
- #配置
- sudo vi /etc/mongod.conf
-
- sudo service mongod restart
-
- mongodb可视化
- https://robomongo.org/download
-
def writeMongodb(content):
    """Insert ``content`` as one document into ``maoyan.maoyan_top100``.

    Connects to the MongoDB server at ``localhost:27017`` and closes the
    client afterwards, so repeated calls do not accumulate open clients.

    Args:
        content: Document (dict) to store.
    """
    # NOTE(review): relies on ``pymongo`` being imported at module level;
    # that import is not visible in this part of the file - confirm.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['maoyan']  # database name
        maoyanTable = db['maoyan_top100']  # collection name
        maoyanTable.insert_one(content)
    finally:
        client.close()
-
- sudo apt-get -y install redis-server
- redis-cli
-
- sudo gedit /etc/redis/redis.conf
- #注释掉bind
- #bind 127.0.0.1
- #启用密码
- requirepass foobared
-
- sudo /etc/init.d/redis-server restart
- sudo /etc/init.d/redis-server stop
- sudo /etc/init.d/redis-server start
-
- sudo python3 -m pip install redis
-
-
- 安装ruby redis-dump
- https://www.ruby-lang.org/zh_cn/documentation/installation/
- sudo apt-get install ruby-full
- sudo gem install redis-dump
- redis-dump
- redis-load
-
- redis-cli -h 127.0.0.1 -p 6379 -a foobared
-
- redis可视化
- sudo snap install redis-desktop-manager #用这句安装,极度可靠
- http://docs.redisdesktop.com/en/latest/install/#build-from-source
-
- from redis import StrictRedis
def write_to_file(content):
    """Push each field of one movie record onto its own Redis list.

    The values are LPUSHed onto the lists ``index``, ``image``, ``title``,
    ``actor``, ``time`` and ``score`` in database 0 of the Redis server at
    ``localhost:6379``. All six pushes are queued on one pipeline and sent
    together, so a record is stored either completely or not at all and only
    one network round trip is needed.

    Args:
        content: Mapping that provides the six keys listed above.

    Raises:
        KeyError: If ``content`` lacks one of the keys; nothing is pushed in
            that case because the pipeline is never executed.
    """
    client = StrictRedis(host='localhost', port=6379, db=0, password='foobared')
    with client.pipeline() as pipe:
        for field in ('index', 'image', 'title', 'actor', 'time', 'score'):
            pipe.lpush(field, content[field])
        pipe.execute()
-