I. Saving as CSV [Excel file]
1. Purpose
- Store the scraped data in a local CSV file, which is easy to open in Excel
2. Usage steps
1) Import the module
2) Open the CSV file
3) Initialize the writer object
4) Write the data (the argument is a list)
import csv

with open('film.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([])             # one row: writerow() takes a list
    writer.writerows([(), (), ()])  # many rows: writerows() takes a list of tuples
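When each record is already a dict, csv.DictWriter is a handy alternative to the plain writer object; a minimal sketch (the field names are illustrative, chosen to match the Maoyan example below):

import csv

with open('film.csv', 'w', newline='') as f:
    # DictWriter maps dict keys to columns via fieldnames
    writer = csv.DictWriter(f, fieldnames=['name', 'star', 'time'])
    writer.writeheader()  # write the header row first
    writer.writerow({'name': '霸王别姬', 'star': '张国荣', 'time': '1993'})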
3. Demo
Single-row writes
# single-row write: writerow() takes one list per call
import csv

with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['姓名', '年龄'])
    writer.writerow(['Jame', '18'])
    writer.writerow(['Tim', '20'])
    writer.writerow(['Dom', '33'])
Multi-row writes
# multi-row write: writerows() takes a list of tuples
import csv

with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([('Jame', '36'), ('Tim', '25'), ('Dom', '30')])
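To check what was written, the file can be read back with csv.reader; a minimal sketch against the test.csv produced above:

import csv

with open('test.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)  # each row comes back as a list, e.g. ['Jame', '36']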
4. Maoyan movie example [csv]
import csv
from urllib import request
import re
import time
import random

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter for scraped films
        self.num = 0

    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # hand the page straight to the parser
        self.parse_html(html)

    def parse_html(self, html):
        # compile the regex once, with re.S so . also matches newlines
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993'), ...]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    # save to the CSV file with writerows()
    def write_html(self, film_list):
        L = []
        with open('maoyanfilm.csv', 'a', newline='') as f:
            # initialize the writer object; don't forget the f argument
            writer = csv.writer(f)
            for film in film_list:
                t = (
                    film[0].strip(),
                    film[1].strip(),
                    film[2].strip()[5:15]
                )
                self.num += 1
                L.append(t)
            # writerows() takes a list of tuples
            writer.writerows(L)
            print(L)

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('Total films scraped:', self.num)

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f s' % (end - start))
II. Saving to a MySQL database
1. Create the database and table
mysql -h127.0.0.1 -uroot -p123456
# create the database and the table
create database maoyandb charset utf8;
use maoyandb;
create table filmtab(
    name varchar(100),
    star varchar(300),
    time varchar(50)
)charset=utf8;
2. Usage steps
Single-row insert
import pymysql

# create the two objects: connection and cursor
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()
# execute the SQL command and commit it to the database;
# the second argument of execute() is a list that fills the %s placeholders
cursor.execute('insert into filmtab values(%s,%s,%s)', ['霸王别姬', '张国荣', '1993'])
db.commit()
# close everything
cursor.close()
db.close()
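Inserts can fail halfway (bad data, lost connection), so a common pattern, sketched here against the same filmtab table, is to commit on success and roll back on any error:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()
try:
    cursor.execute('insert into filmtab values(%s,%s,%s)', ['活着', '葛优', '1994'])
    db.commit()    # make the insert permanent
except Exception as e:
    db.rollback()  # undo the partial write on any error
    print('insert failed:', e)
finally:
    cursor.close()
    db.close()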
Multi-row insert
import pymysql

# create the two objects: connection and cursor
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()
# the scraped data
film_list = [('活着', '葛优', '1994'), ('恐怖直播', '河正宇', '2013')]
# executemany() runs the statement once per tuple, filling the %s placeholders
cursor.executemany('insert into filmtab values(%s,%s,%s)', film_list)
db.commit()
# close everything
cursor.close()
db.close()
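To confirm the rows landed, they can be queried back; a minimal sketch against the same table:

import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()
cursor.execute('select * from filmtab')
# fetchall() returns a tuple of row tuples, e.g. (('活着', '葛优', '1994'), ...)
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()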
3. Maoyan movie example [mysql]
import pymysql
from urllib import request
import re
import time
import random

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter for scraped films
        self.num = 0
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # hand the page straight to the parser
        self.parse_html(html)

    def parse_html(self, html):
        # compile the regex once, with re.S so . also matches newlines
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993'), ...]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        ins = 'insert into filmtab values(%s,%s,%s)'
        for film in film_list:
            L = [
                film[0].strip(),
                film[1].strip(),
                film[2].strip()[5:15]
            ]
            self.num += 1
            print('Scraped', self.num, 'films')
            self.cursor.execute(ins, L)
            # commit the insert to the database
            self.db.commit()

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('Total films scraped:', self.num)
        # disconnect from the database
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f s' % (end - start))
III. Saving to MongoDB
1. Characteristics
- Non-relational (NoSQL) database
- Databases and collections (a collection ≈ a MySQL table); documents (a document ≈ a MySQL table row)
- No need to create databases or collections by hand: they appear on the first insert (see the sketch after this list)
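A minimal pymongo sketch of that auto-creation behavior (the names demodb and demoset are illustrative):

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
# neither the database nor the collection exists yet;
# both are created implicitly by the first insert
conn['demodb']['demoset'].insert_one({'name': '霸王别姬'})
print(conn.list_database_names())  # 'demodb' now appears in the list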
2. Common commands
# list all databases
> show dbs
# switch to a database
> use <db_name>
# list all collections in the current database
> show collections
# list all documents in a collection
> db.<collection>.find().pretty()
# count the documents in a collection
> db.<collection>.count()
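The same operations from Python, sketched with pymongo (note that pymongo's modern counterpart of the shell's count() is count_documents()):

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = conn['maoyandb']
myset = db['filmtab']
print(conn.list_database_names())   # show dbs
print(db.list_collection_names())   # show collections
for doc in myset.find():            # db.<collection>.find()
    print(doc)
print(myset.count_documents({}))    # db.<collection>.count()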
3. Usage steps
import pymongo

# 1. connection object
conn = pymongo.MongoClient(
    host='127.0.0.1',
    port=27017
)
# 2. database object
db = conn['maoyandb']  # method 1
# db = conn.maoyandb   # method 2
# 3. collection object
myset = db['filmtab']
# 4. insert into the database
# insert one document
myset.insert_one({'name': '赵敏'})
# insert many documents: takes a list of dicts (sample data added to complete the call)
myset.insert_many([{'name': '活着'}, {'name': '恐怖直播'}])
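The inserted documents can be read back with find(); a minimal standalone sketch:

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
myset = conn['maoyandb']['filmtab']
# each document comes back as a dict; MongoDB adds an _id field automatically
for doc in myset.find({'name': '赵敏'}):
    print(doc)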
4. Maoyan movie example [mongodb]
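The spider below imports ua_list from a local useragents module that is not shown in this note; a minimal stand-in would be a useragents.py like the following (the second string is illustrative):

# useragents.py -- hypothetical helper: a pool of User-Agent strings to rotate
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0 Safari/537.36',
]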
import pymongo
from urllib import request
import re
import time
import random
from useragents import ua_list  # local module: a pool of User-Agent strings

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter for scraped films
        self.num = 0
        # create the three objects: client, database, collection
        self.conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmset']

    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # hand the page straight to the parser
        self.parse_html(html)

    def parse_html(self, html):
        # compile the regex once, with re.S so . also matches newlines
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993'), ...]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        for film in film_list:
            film_dict = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                'time': film[2].strip()[5:15]
            }
            self.num += 1
            # insert the document into MongoDB
            self.myset.insert_one(film_dict)
            print('Scraped', self.num, 'films')

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('Total films scraped:', self.num)

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f s' % (end - start))