1.TXT文件存储
# Scrape movie name, score and tagline from the first page of Douban Movie
# Top 250 and append them to a plain-text file.
from pyquery import PyQuery as pq
import requests
url = 'https://movie.douban.com/top250'
headers = {
# A desktop-browser User-Agent; Douban rejects the default requests UA.
'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
}
res = requests.get(url=url,headers=headers)
doc = pq(res.text)
items = doc('.info').items()
# Open the output file once, not once per movie: re-opening inside the loop
# wastes a file-open/close round trip for every item.
with open('movies.txt','a',encoding='utf-8') as file:
    for item in items:
        # find() returns every matching descendant; the first <span> is the title
        name = pq(item.find('.hd').html()).find('span:first-child').text()
        score = pq(item.find('.star').html()).find('span:nth-child(2)').text()
        comment = item.find('.quote').text()
        file.write('\n'.join([name,score,comment]))
        file.write('\n'+"="*20+'\n')
2.JSON文件存储
JSON(JavaScript Object Notation)通过对象(对应Python的字典)和数组(对应Python的列表)的组合来表示数据结构,是一种轻量级的数据交换格式。JSON数据本身以文本(字符串)形式存在。
loads()方法将JSON字符串解析为可直接操作的Python数据结构,如字典或列表;
dumps()方法将Python对象序列化为JSON格式的字符串。
2.1读取JSON
# Demo: parse a JSON string into Python data structures with json.loads().
import json
# Renamed from `str` — the original name shadowed the builtin str type.
json_str = '''
[{
"name":"aa",
"age":18
},{
"name":"bb",
"age":20
}]
'''
print(type(json_str)) # <class 'str'>
data = json.loads(json_str)
print(data) # [{'name': 'aa', 'age': 18}, {'name': 'bb', 'age': 20}]
print(data[0]['name']) # aa
# JSON requires double quotes around keys/strings, otherwise parsing fails:
# json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 5 (char 8)
2.2 写入json
# Demo: serialize Python data to a JSON file.
import json
data = [{
"name":"aa",
"age":18
},{
"name":"bb",
"age":20
}]
# Explicit utf-8 keeps the output portable across platforms; json.dump()
# streams straight to the handle (equivalent to file.write(json.dumps(data))).
with open('test.json','w',encoding='utf-8') as file:
    json.dump(data, file)
# Scrape the first page of Douban Movie Top 250 and save the results as JSON.
from pyquery import PyQuery as pq
import requests
import json
url = 'https://movie.douban.com/top250'
headers = {
# A desktop-browser User-Agent; Douban rejects the default requests UA.
'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
}
res = requests.get(url=url,headers=headers)
doc = pq(res.text)
items = doc('.info').items()
movies_list = []
for item in items:
    name = pq(item.find('.hd').html()).find('span:first-child').text()
    score = pq(item.find('.star').html()).find('span:nth-child(2)').text()
    comment = item.find('.quote').text()
    movies_list.append([name,score,comment])
# Mode 'w', not 'a': appending a second JSON document to the same file would
# make it unparseable. ensure_ascii=False keeps Chinese text human-readable.
with open('movies.json', 'w', encoding='utf-8') as file:
    json.dump(movies_list, file, indent=4, ensure_ascii=False)
3.CSV文件存储
3.1 写入
列表写入:
# Demo: write list rows to a CSV file.
import csv
# newline='' prevents the csv module's own '\r\n' row endings from being
# doubled on Windows, which would show up as blank lines between rows.
with open('test.csv','w',newline='') as csvfile: # open the file, get a handle
    # writer() wraps the handle; fields are comma-separated by default,
    # delimiter overrides the separator.
    writer = csv.writer(csvfile,delimiter=' ')
    writer.writerow(['name','age']) # single-row write
    writer.writerows([['dd',23],['ff',18]]) # multi-row write
字典写入,一般爬虫数据都是结构化数据:
# Exercise: Douban Movie Top 250 — save each movie's name, score and tagline
# to a CSV file, one dict-shaped record per row.
from pyquery import PyQuery as pq
import requests
import csv
url = 'https://movie.douban.com/top250'
headers = {
'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/79.0.3945.88 Safari/537.36'
}
res = requests.get(url=url,headers=headers)
doc = pq(res.text)
list_all = []
for info in doc('.info').items():
    # find() matches every qualifying descendant; the first <span> is the title
    list_all.append({
        'name': pq(info.find('.hd').html()).find('span:first-child').text(),
        'score': pq(info.find('.star').html()).find('span:nth-child(2)').text(),
        'comment': info.find('.quote').text(),
    })
fieldnames = ['name','score','comment']
# newline='' avoids the spurious blank line between rows on Windows
with open('movies.csv','w',encoding='utf-8',newline='') as csvfile:
    writer = csv.DictWriter(csvfile,fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(list_all)
3.2读取
# Demo: read a CSV file row by row; each row comes back as a list of strings.
import csv
with open('test.csv','r',encoding='utf-8') as csvfile:
    for record in csv.reader(csvfile):
        print(record)