这次依然是爬取猫眼电影排行榜,然后将爬取到的数据分别存储为TXT、JSON、CSV文件,或者分别存储到MySQL、MongoDB、Redis等数据库中。
这次的操作和所依赖的技术相对复杂一些:既要掌握基本的SQL语句,也要配置好各种数据库环境。
# coding = utf-8
__author__ = 'wardseptember'
__date__ = '18-10-26'
import json
import requests
from requests.exceptions import RequestException
import re
from redis import StrictRedis
def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when a ``RequestException`` (including
        a timeout) is raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie entries from the markup of one ranking page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (what
            ``get_one_page`` returns on failure) produces no items instead
            of raising.

    Yields:
        One dict per ``<dd>`` entry with the string-valued keys ``index``,
        ``image``, ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        return
    # Raw strings keep the backslash-d a regex escape rather than an invalid
    # string escape (which newer Python versions warn about).
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, actor, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Skip the first 3 characters of the cast text
            # (presumably a label prefix on the page - verify against live markup).
            'actor': actor.strip()[3:],
            # Skip the first 5 characters of the release text
            # (presumably a label prefix on the page - verify against live markup).
            'time': release.strip()[5:],
            # The score is split into an integer part and a fraction part in the markup.
            'score': integer + fraction,
        }
def write_to_file(content, path='result_re.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        content: JSON-serialisable object, e.g. one dict yielded by
            ``parse_one_page``.
        path: Destination file; defaults to ``result_re.txt`` in the current
            working directory.

    NOTE: this file defines ``write_to_file`` more than once; at import time
    the last definition wins.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
import json
def write_to_file(content, path='result_re.json'):
    """Append one record to a file as a single JSON line (JSON Lines layout).

    Args:
        content: JSON-serialisable object, e.g. one dict yielded by
            ``parse_one_page``.
        path: Destination file; defaults to ``result_re.json`` in the current
            working directory.

    NOTE: this file defines ``write_to_file`` more than once; at import time
    the last definition wins.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
import csv
def write_to_file(content, path='result_re.csv'):
    """Append one record to a CSV file, emitting the header row only once.

    Args:
        content: Dict with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score``.
        path: Destination file; defaults to ``result_re.csv`` in the current
            working directory.

    Raises:
        ValueError: If ``content`` has keys outside the field list
            (raised by ``csv.DictWriter``).

    NOTE: this file defines ``write_to_file`` more than once; at import time
    the last definition wins.
    """
    fieldnames = ['index', 'image', 'title', 'actor', 'time', 'score']
    # newline='' lets the csv module control line endings; without it extra
    # blank rows can appear on platforms that translate newlines.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the stream starts at end-of-file, so position 0
        # means the file is empty and still needs its header.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerow(content)
我的系统是Ubuntu 16.04 LTS,安装了MySQL 5.7和可视化工具mysql-workbench,安装配置教程在这,就不详细说了。
连接数据库时,需要把host='localhost', user='root', password='156352', port=3306这些参数
改成你自己的配置,对应输入就行了。
# Create the database
def createDatabase():
    """Connect to the local MySQL server and create ``MaoYanDB`` if missing.

    Prints the server version and a confirmation message. The connection is
    closed even if one of the statements raises.
    """
    # NOTE(review): credentials are hard-coded; replace them with your own.
    # NOTE(review): pymysql is not imported in the visible part of this file.
    db = pymysql.connect(host='localhost', user='root', password='156352', port=3306)
    try:
        with db.cursor() as cursor:
            cursor.execute('SELECT VERSION()')
            data = cursor.fetchone()
            print('Database version:', data)
            cursor.execute("CREATE DATABASE IF NOT EXISTS MaoYanDB DEFAULT CHARACTER SET utf8")
        print("MaoYanDB数据库创建成功")
    finally:
        db.close()
# Create the table
def createTableMaoYanDB():
    """Create the ``maoyan`` table inside ``MaoYanDB`` if it does not exist.

    Columns: ``id`` (INT, primary key), ``title``, ``image``, ``actor``,
    ``releasetime`` (all VARCHAR(255)) and ``score`` (FLOAT). The connection
    is closed even if the statement raises.
    """
    # NOTE(review): credentials are hard-coded; replace them with your own.
    # NOTE(review): pymysql is not imported in the visible part of this file.
    sql_table = (
        'CREATE TABLE IF NOT EXISTS maoyan (id INT NOT NULL, title VARCHAR(255) NOT NULL, '
        'image VARCHAR(255) NOT NULL, actor VARCHAR(255) NOT NULL, releasetime VARCHAR(255) NOT NULL, '
        'score FLOAT NOT NULL, PRIMARY KEY(id))'
    )
    db = pymysql.connect(host='localhost', user='root', password='156352', port=3306,
                         db='MaoYanDB', charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.execute(sql_table)
        print("maoyan表创建成功")
    finally:
        db.close()
# Insert one record
def writeMysql(content):
    """Insert one movie record into the ``maoyan`` table.

    Args:
        content: Either a sequence of six values in column order
            (id, title, image, actor, releasetime, score), or a dict with the
            keys ``index``, ``image``, ``title``, ``actor``, ``time`` and
            ``score`` as yielded by ``parse_one_page``.

    Raises:
        KeyError: If a dict ``content`` lacks one of the expected keys.
        Exception: Any database error is re-raised after a rollback.
    """
    if isinstance(content, dict):
        # A dict cannot fill positional %s placeholders, and its key order
        # differs from the column order, so build the row explicitly
        # (index -> id, time -> releasetime).
        content = tuple(
            content[key] for key in ('index', 'title', 'image', 'actor', 'time', 'score')
        )
    sql_insert = 'INSERT INTO maoyan(id,title,image,actor,releasetime,score) values(%s, %s, %s, %s, %s, %s)'
    # NOTE(review): credentials are hard-coded; replace them with your own.
    # NOTE(review): pymysql is not imported in the visible part of this file.
    db = pymysql.connect(host='localhost', user='root', password='156352', port=3306,
                         db='MaoYanDB', charset='utf8')
    try:
        with db.cursor() as cursor:
            # Values are passed separately so the driver escapes them.
            cursor.execute(sql_insert, content)
        db.commit()
    except Exception:
        # Undo the partial transaction before propagating the error.
        db.rollback()
        raise
    finally:
        db.close()
我的系统是Ubuntu 16.04 LTS,下面是我配置MongoDB时的笔记,仅供参考。网上有很多教程。
sudo python3 -m pip install pymongo #安装python模块
#mongodb官方下载连接
https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4
echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
sudo apt-get update
sudo apt-get install -y mongodb-org
在你的/home/用户名/下新建data文件夹,再在data文件夹下新建db文件夹
mongod --port 27017 --dbpath ~/data/db  # 这个终端不能关,保持运行,再进行以下配置
db.createUser({user: 'admin',pwd:'156352',roles:[{role:'root',db:'admin'}]})
#配置
sudo vi /etc/mongod.conf
sudo service mongod restart
mongodb可视化
https://robomongo.org/download
def writeMongodb(content):
    """Insert one record into collection ``maoyan_top100`` of database ``maoyan``.

    Args:
        content: Document (dict) to store.

    The client is closed after the insert so repeated calls do not pile up
    open connections.
    """
    # NOTE(review): pymongo is not imported in the visible part of this file.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['maoyan']  # database name
        maoyanTable = db['maoyan_top100']  # collection name
        maoyanTable.insert_one(content)
    finally:
        client.close()
sudo apt-get -y install redis-server
redis-cli
sudo gedit /etc/redis/redis.conf
#注释掉bind这一行
#bind 127.0.0.1
#启用密码
requirepass foobared
sudo /etc/init.d/redis-server restart
sudo /etc/init.d/redis-server stop
sudo /etc/init.d/redis-server start
sudo python3 -m pip install redis
安装ruby redis-dump
https://www.ruby-lang.org/zh_cn/documentation/installation/
sudo apt-get install ruby-full
sudo gem install redis-dump
redis-dump
redis-load
redis-cli -h 127.0.0.1 -p 6379 -a foobared
redis可视化
sudo snap install redis-desktop-manager #用这句安装,极度可靠
http://docs.redisdesktop.com/en/latest/install/#build-from-source
from redis import StrictRedis
def write_to_file(content):
    """Push each field of one record onto its own Redis list.

    Args:
        content: Dict with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score``; each value is ``LPUSH``-ed
            onto the list named after its key.

    Raises:
        KeyError: If ``content`` lacks one of the expected keys.

    NOTE: this file defines ``write_to_file`` more than once; at import time
    the last definition wins.
    """
    # NOTE(review): host, port and password are hard-coded; adjust to your setup.
    # Named ``client`` rather than ``redis`` to avoid shadowing the package name.
    client = StrictRedis(host='localhost', port=6379, db=0, password='foobared')
    for key in ('index', 'image', 'title', 'actor', 'time', 'score'):
        client.lpush(key, content[key])