一.储存为CSV【excel文件】
1.作用
- 将爬取的数据存放到本地的csv文件中,便于用excel打开
2.使用步骤
- 1、导入模块
- 2、打开csv文件
- 3、初始化写入对象
- 4、写入数据(writerow 的参数为一个列表,表示一行;writerows 的参数为由多行组成的列表)
- import csv
-
# Open with newline='' so the csv module controls line endings itself;
# without it, Windows output gets an extra blank line after every row.
with open('film.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # writerow() takes ONE row: a sequence of field values.
    writer.writerow([])
    # writerows() takes MANY rows: an iterable of such sequences.
    writer.writerows([(), (), ()])
-
3.Demo
单行操作
-
- import csv
# newline='' lets the csv module write its own line terminators.
with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['姓名', '年龄'])
    writer.writerow(['Jame', '18'])
    # One record at a time means writerow(). writerows() would treat each
    # string as a separate row and split it into single-character fields.
    writer.writerow(['Tim', '20'])
    writer.writerow(['Dom', '33'])
-
多行操作
-
- import csv
# Each tuple is one CSV row.
rows = [('Jame', '36'), ('Tim', '25'), ('Dom', '30')]
# newline='' prevents the doubled line breaks that appear on Windows when
# the csv module's own '\r\n' terminator is translated a second time.
with open('test.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)
-
4.猫眼电影实际案例[csv]
- import csv
- from urllib import request, parse
- import re
- import time
- import random
-
-
class MaoyanSpider(object):
    """Fetch Maoyan board pages and append the extracted films to a CSV file.

    The pipeline is chained: ``main`` -> ``get_html`` -> ``parse_html`` ->
    ``write_html``. Rows are appended to ``maoyanfilm.csv``.
    """

    def __init__(self):
        # Paged board URL; the placeholder receives the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of films written so far.
        self.num = 0

    def get_html(self, url):
        """Download ``url``, decode it as UTF-8 and pass it to ``parse_html``."""
        headers = {
            'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release) triples and pass them to ``write_html``."""
        # NOTE(review): the pattern ends with a literal space after the last
        # </p>; markup without that space will not match - confirm against
        # the live page.
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        # re.S lets '.' span the line breaks between the tags.
        pattern = re.compile(re_, re.S)
        film_list = pattern.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        """Clean every matched triple and append the rows to ``maoyanfilm.csv``.

        ``film_list`` is an iterable of 3-tuples of strings as produced by
        ``parse_html``.
        """
        rows = [
            # [5:15] skips the first 5 characters of the release field and
            # keeps the next 10 - presumably a label followed by a
            # YYYY-MM-DD date; verify against the page.
            (title.strip(), star.strip(), released.strip()[5:15])
            for title, star, released in film_list
        ]
        # newline='' lets the csv module emit its own line terminators.
        with open('maoyanfilm.csv', 'a', newline='') as f:
            csv.writer(f).writerows(rows)
        self.num += len(rows)
        print(rows)

    def main(self):
        """Crawl offsets 0, 10, ..., 90, pausing 1-2 seconds between pages."""
        for offset in range(0, 91, 10):
            self.get_html(self.url.format(offset))
            # Random pause between requests.
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")
-
-
if __name__ == '__main__':
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    spider = MaoyanSpider()
    spider.main()
    end = time.perf_counter()
    print('执行时间:%.2f' % (end - start))
-
二.储入Mysql数据库
1.数据库建表
- mysql -h127.0.0.1 -uroot -p123456
- # 建库建表
- create database maoyandb charset utf8;
- use maoyandb;
- create table filmtab(
- name varchar(100),
- star varchar(300),
- time varchar(50)
- )charset=utf8;
-
2.使用步骤
单行操作
- import pymysql
-
-
# PyMySQL 1.0+ only accepts connection settings as keyword arguments.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    database='maoyandb',
    charset='utf8',
)
try:
    # The cursor is closed automatically when the with-block exits.
    with db.cursor() as cursor:
        # Values are passed separately so the driver escapes them.
        cursor.execute(
            'insert into filmtab values(%s,%s,%s)',
            ['霸王别姬', '张国荣', '1993'],
        )
    # The insert is not persisted until it is committed.
    db.commit()
finally:
    # Release the connection even if the insert or commit fails.
    db.close()
-
多行操作
- import pymysql
-
-
# PyMySQL 1.0+ only accepts connection settings as keyword arguments.
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123456',
    database='maoyandb',
    charset='utf8',
)

# One tuple per table row: (name, star, time).
film_list = [('活着', '葛优', '1994'), ('恐怖直播', '河正宇', '2013')]

try:
    # The cursor is closed automatically when the with-block exits.
    with db.cursor() as cursor:
        # executemany() runs the statement once per tuple in film_list.
        cursor.executemany('insert into filmtab values(%s,%s,%s)', film_list)
    db.commit()
finally:
    # Release the connection even if the insert or commit fails.
    db.close()
-
-
3.猫眼电影实际案例[mysql]
- import pymysql
- from urllib import request, parse
- import re
- import time
- import random
-
class MaoyanSpider(object):
    """Fetch Maoyan board pages and insert the extracted films into MySQL.

    The pipeline is chained: ``main`` -> ``get_html`` -> ``parse_html`` ->
    ``write_html``. Rows go into table ``filmtab`` of database ``maoyandb``.
    """

    def __init__(self):
        # Paged board URL; the placeholder receives the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of films handled so far.
        self.num = 0
        # PyMySQL 1.0+ only accepts connection settings as keyword arguments.
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='maoyandb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def get_html(self, url):
        """Download ``url``, decode it as UTF-8 and pass it to ``parse_html``."""
        headers = {
            'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release) triples and pass them to ``write_html``."""
        # NOTE(review): the pattern ends with a literal space after the last
        # </p>; markup without that space will not match - confirm against
        # the live page.
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        # re.S lets '.' span the line breaks between the tags.
        pattern = re.compile(re_, re.S)
        film_list = pattern.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        """Clean every matched triple, insert the batch and commit.

        ``film_list`` is an iterable of 3-tuples of strings as produced by
        ``parse_html``.
        """
        ins = 'insert into filmtab values(%s,%s,%s)'
        rows = []
        for film in film_list:
            rows.append([
                film[0].strip(),
                film[1].strip(),
                # Skips the first 5 characters and keeps the next 10 -
                # presumably a label followed by a YYYY-MM-DD date; verify
                # against the page.
                film[2].strip()[5:15],
            ])
            self.num += 1
            print('爬取成功', self.num, '部')
        if rows:
            # One batched call per page instead of one execute() per film.
            self.cursor.executemany(ins, rows)
        self.db.commit()

    def main(self):
        """Crawl offsets 0, 10, ..., 90, then close the cursor and connection."""
        try:
            for offset in range(0, 91, 10):
                self.get_html(self.url.format(offset))
                # Random pause between requests.
                time.sleep(random.randint(1, 2))
            print('共抓取数据', self.num, "部")
        finally:
            # Always release the database resources, even when a request or
            # an insert fails part-way through the crawl.
            self.cursor.close()
            self.db.close()
-
-
if __name__ == '__main__':
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    spider = MaoyanSpider()
    spider.main()
    end = time.perf_counter()
    print('执行时间:%.2f' % (end - start))
-
三.存入MongoDB
1.特点
- 非关系型数据库
- 层级结构:库 → 集合(相当于MySQL中的表)→ 文档(相当于MySQL中的表记录)
- 无须手动建库建集合:首次向集合插入文档时,库和集合会自动创建
2.常用命令
-
- >show dbs
-
- >use 库名
-
- >show collections
-
- >db.集合名.find().pretty()
-
- >db.集合名.count()
-
3.使用步骤
- import pymongo
-
# Connect to the MongoDB server.
conn = pymongo.MongoClient(
    host='127.0.0.1',
    port=27017
)

# Select the database by name.
db = conn['maoyandb']

# Select the collection by name.
myset = db['filmtab']

# insert_one() stores a single document (a dict).
myset.insert_one({'name': '赵敏'})

# insert_many() requires a non-empty list of documents; calling it with no
# argument raises TypeError.
myset.insert_many([{'name': '周芷若'}, {'name': '小昭'}])
-
4.猫眼电影实际案例[mongodb]
- import pymongo
- from urllib import request
- import re
- import time
- import random
- from useragents import ua_list
-
-
class MaoyanSpider(object):
    """Fetch Maoyan board pages and store the extracted films in MongoDB.

    The pipeline is chained: ``main`` -> ``get_html`` -> ``parse_html`` ->
    ``write_html``. Documents go into collection ``filmset`` of database
    ``maoyandb``.
    """

    def __init__(self):
        # Paged board URL; the placeholder receives the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of films stored so far.
        self.num = 0
        # Client -> database -> collection handles.
        self.conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmset']

    def get_html(self, url):
        """Download ``url``, decode it as UTF-8 and pass it to ``parse_html``."""
        headers = {
            # A User-Agent is picked at random from ua_list for each request.
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release) triples and pass them to ``write_html``."""
        # NOTE(review): the pattern ends with a literal space after the last
        # </p>; markup without that space will not match - confirm against
        # the live page.
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        # re.S lets '.' span the line breaks between the tags.
        pattern = re.compile(re_, re.S)
        film_list = pattern.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        """Turn every matched triple into a document and insert it.

        ``film_list`` is an iterable of 3-tuples of strings as produced by
        ``parse_html``.
        """
        for film in film_list:
            film_dict = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                # Skips the first 5 characters and keeps the next 10 -
                # presumably a label followed by a YYYY-MM-DD date; verify
                # against the page.
                'time': film[2].strip()[5:15],
            }
            self.num += 1
            self.myset.insert_one(film_dict)
            print('爬取成功', self.num, '部')

    def main(self):
        """Crawl offsets 0, 10, ..., 90, then close the MongoDB client."""
        try:
            for offset in range(0, 91, 10):
                self.get_html(self.url.format(offset))
                # Random pause between requests.
                time.sleep(random.randint(1, 2))
            print('共抓取数据', self.num, "部")
        finally:
            # Release the client's connections even if the crawl fails.
            self.conn.close()
-
-
if __name__ == '__main__':
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    spider = MaoyanSpider()
    spider.main()
    end = time.perf_counter()
    print('执行时间:%.2f' % (end - start))
-