1.TXT文件存储
- #豆瓣电影TOP250首页电影信息抓取
- from pyquery import PyQuery as pq
- import requests
-
- url = 'https://movie.douban.com/top250'
- headers = {
- 'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
- }
- res = requests.get(url=url,headers=headers)
- doc = pq(res.text)
- items = doc('.info').items()
- for item in items:
- name = pq(item.find('.hd').html()).find('span:first-child').text() #find()会返回所有符合条件的后代节点,这里用span:first-child限定只匹配作为首个子元素的span
- score = pq(item.find('.star').html()).find('span:nth-child(2)').text()
- comment = item.find('.quote').text()
- with open('movies.txt','a',encoding='utf-8') as file:
- file.write('\n'.join([name,score,comment]))
- file.write('\n'+"="*20+'\n')
2.JSON文件存储
JSON(JavaScript Object Notation)是一种轻量级的数据交换格式,通过对象(对应Python的字典)和数组(对应Python的列表)的组合来表示数据结构。JSON数据在Python中以字符串形式存在。
json.loads()方法将JSON字符串转换为Python对象,即可操作的数据结构,如字典或列表;
json.dumps()方法将Python对象(如字典或列表)转换为JSON格式的文本字符串。
2.1读取JSON
- import json
- str = '''
- [{
- "name":"aa",
- "age":18
- },{
- "name":"bb",
- "age":20
- }]
- '''
- print(type(str)) #<class 'str'>
- data = json.loads(str)
- print(data) #[{'name': 'aa', 'age': 18}, {'name': 'bb', 'age': 20}],类型为<class 'list'>
- print(data[0]['name'])
- #JSON字符串中的键名和字符串值必须用双引号包围,若使用单引号,json.loads()会报如下错误:
- #json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 5 (char 8)
2.2 写入JSON
- import json
- data = [{
- "name":"aa",
- "age":18
- },{
- "name":"bb",
- "age":20
- }]
- with open('test.json','w') as file:
- file.write(json.dumps(data))
- #练习:抓取豆瓣电影TOP250首页的电影名称、评分、推荐语,并写入JSON文件
- from pyquery import PyQuery as pq
- import requests
- import json
-
- url = 'https://movie.douban.com/top250'
- headers = {
- 'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
- }
- res = requests.get(url=url,headers=headers)
- doc = pq(res.text)
- items = doc('.info').items()
- movies_list = []
- for item in items:
- name = pq(item.find('.hd').html()).find('span:first-child').text()
- score = pq(item.find('.star').html()).find('span:nth-child(2)').text()
- comment = item.find('.quote').text()
- movies_list.append([name,score,comment])
- with open('movies.json', 'a', encoding='utf-8') as file: #注意:'a'为追加模式,重复运行会在文件末尾再追加一个JSON数组,导致文件不再是合法的JSON;如需每次覆盖,请改用'w'
- file.write(json.dumps(movies_list,indent=4,ensure_ascii=False))
3.CSV文件存储
3.1 写入
列表写入:
- import csv
- with open('test.csv','w') as csvfile: #打开文件,获得文件句柄;建议同时传入newline='',否则在Windows下行与行之间会出现空白行
- #写入的文本默认以逗号分隔,delimiter可指定分隔符
- writer = csv.writer(csvfile,delimiter=' ') #调用writer()方法初始化文件对象,传入该句柄
- writer.writerow(['name','age'])
- # writer.writerow(['aa',14]) #单行写入
- # writer.writerow(['bb',24])
- # writer.writerow(['cc',25])
- writer.writerows([['dd',23],['ff',18]]) #多行写入
字典写入(爬虫抓取的数据一般是结构化数据,适合以字典形式写入):
- #练习:豆瓣电影TOP250,电影名称,评分,推荐语
- from pyquery import PyQuery as pq
- import requests
- import csv
-
- url = 'https://movie.douban.com/top250'
- headers = {
- 'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
- }
- res = requests.get(url=url,headers=headers)
- doc = pq(res.text)
- items = doc('.info').items()
- list_all = []
- for item in items:
- movies_dict = {}
- name = pq(item.find('.hd').html()).find('span:first-child').text() #find()会返回所有符合条件的后代节点,这里用span:first-child限定只匹配作为首个子元素的span
- score = pq(item.find('.star').html()).find('span:nth-child(2)').text()
- comment = item.find('.quote').text()
- movies_dict['name'] = name
- movies_dict['score'] = score
- movies_dict['comment'] = comment
- list_all.append(movies_dict)
- with open('movies.csv','w',encoding='utf-8',newline='') as csvfile: #newline=''解决行与行之间的空白行
- fieldnames = ['name','score','comment']
- writer = csv.DictWriter(csvfile,fieldnames=fieldnames)
- writer.writeheader()
- for row in list_all:
- writer.writerow(row)
3.2读取
- import csv
- with open('test.csv','r',encoding='utf-8') as csvfile:
- reader = csv.reader(csvfile) #若写入时指定了delimiter(如3.1中的空格),读取时需传入相同的delimiter,否则整行会被当作一个字段
- for row in reader:
- print(row)