I won't cover how to install Python on Windows and macOS here; on macOS you can run python3 (or python3 --version) in zsh to check whether Python is already installed.
pip3 install requests
In the code below a User-Agent is added to the request headers, because a bare request makes Douban respond with 418; we need to pretend to be a browser. Once the GET request goes through, we can print the response.
import requests
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

startNum = 0  # paging offset: 0, 25, 50, ..., 225

response = requests.get(
    f'https://movie.douban.com/top250?start={startNum}&filter=', headers=headers)
if response.ok:
    print(response.text)
    time.sleep(1)  # pause between requests to be polite
else:
    print('Request failed')
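If you want to see the 418 for yourself, send the same request without the header and compare status codes. A minimal sketch (the 418 is what Douban returned at the time of writing and may change):

import requests

# Without a browser-like User-Agent, Douban rejects the request with 418.
bare = requests.get('https://movie.douban.com/top250')
print(bare.status_code)  # 418 at the time of writing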
pip3 install beautifulsoup4
Take the movie's main image (an img tag) as an example: all of the main-image img tags share one trait, a width of 100, so we can use that attribute to find them.
from bs4 import BeautifulSoup

movies = []
soup = BeautifulSoup(response.text, "html.parser")
# HTML attribute values are strings, so match "100" rather than the int 100
soupAllImg = soup.find_all('img', attrs={"width": "100"})
for img in soupAllImg:
    movies.append({
        "mainImg": img.attrs['src']
    })
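The same lookup can also be written as a CSS attribute selector via select(); this sketch is equivalent to the find_all call above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "html.parser")
# img[width="100"] selects the same tags as find_all('img', attrs={"width": "100"})
movies = [{"mainImg": img.attrs['src']} for img in soup.select('img[width="100"]')]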
The json module ships with Python. Remember to pass ensure_ascii=False and open the file with encoding='utf-8'; otherwise your Chinese text will be written out as a pile of \uXXXX Unicode escapes.
import json

print('Writing file...')
_jsonStr = json.dumps(movieList, indent=4, ensure_ascii=False)
with open('movies.json', 'w', encoding='utf-8') as f:
    f.write(_jsonStr)
print('Write finished...')
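You can see the difference ensure_ascii makes with a one-off experiment (the title here is just sample data, not scraped output):

import json

data = {"name": "肖申克的救赎"}  # sample title for illustration
print(json.dumps(data))                      # {"name": "\u8096\u7533\u514b..."}
print(json.dumps(data, ensure_ascii=False))  # {"name": "肖申克的救赎"}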
import requests
from bs4 import BeautifulSoup
import json
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

# Movie data
# Parse one page of results
def getContents(response):
    movies = []
    print('Scraping...')
    soup = BeautifulSoup(response.text, "html.parser")
    # main images: the only img tags on the page with width="100"
    soupAllImg = soup.find_all('img', attrs={"width": "100"})
    for img in soupAllImg:
        movies.append({
            "mainImg": img.attrs['src']
        })
    # titles: the first span inside each div.hd holds the Chinese title
    soupAllName = soup.find_all('div', attrs={"class": "hd"})
    for index, nameDiv in enumerate(soupAllName):
        _name = nameDiv.find('a').find('span').string
        if "/" not in _name:
            movies[index].update({'name': _name})
    # one-line quotes; note: a movie without a quote has no span.inq,
    # so this index-based pairing assumes every movie on the page has one
    soupAllDescribe = soup.find_all('span', attrs={"class": "inq"})
    for index, describe in enumerate(soupAllDescribe):
        movies[index].update({'describe': describe.string})
    return movies

movieList = []

def start(response, startNum):
    _movies = getContents(response)
    movieList.extend(_movies)
    if startNum == 225:  # last page (offsets run 0, 25, ..., 225): write everything out
        print('Scraping finished, writing file...')
        json_str = json.dumps(movieList, indent=4, ensure_ascii=False)
        with open('movies.json', 'w', encoding='utf-8') as f:
            f.write(json_str)
        print('Write finished...')

for startNum in range(0, 250, 25):
    print(f'Fetching movies {startNum + 1} to {startNum + 25}...')
    response = requests.get(
        f'https://movie.douban.com/top250?start={startNum}&filter=', headers=headers)
    if response.ok:
        start(response, startNum)
        time.sleep(1)  # pause between pages to avoid hammering the site
    else:
        print('Request failed')
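As a quick sanity check after the run, you can read movies.json back and inspect it; a minimal sketch:

import json

with open('movies.json', encoding='utf-8') as f:
    movies = json.load(f)
print(len(movies))  # should be 250 for the full Top 250
print(movies[0])    # e.g. {'mainImg': '...', 'name': '...', 'describe': '...'}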