Goal: download the cover images of Douban's hot movies. Page: https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0
Approach: analyze the request data to collect information on every hot movie, follow each movie's URL to its detail page to get the cover image URL, then download the images with multiple threads.
1. Analyzing the request data
The hot-movies page shows only 20 movies at first; clicking "Load more" reveals another 20.
Clicking "Load more" fires this request: https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20
Opening that link returns JSON data; clearly page_start is the index of the first movie to return, and page_limit=20 means each request returns 20 entries.
Testing by hand shows there are 330 hot movies in total.
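That count can also be verified programmatically by requesting pages until the returned subjects list comes back empty; a minimal sketch using the same urllib3 + fake_useragent setup as the script below (it assumes the endpoint simply returns an empty subjects list once page_start passes the last movie, and is not part of the original code):

from urllib3 import PoolManager
from fake_useragent import UserAgent
import json

http = PoolManager()
api = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={}"

total, page_start = 0, 0
while True:
    r = http.request('GET', api.format(page_start), headers={'User-Agent': str(UserAgent().random)})
    subjects = json.loads(r.data.decode('utf-8'))['subjects']
    if not subjects:        # no more hot movies
        break
    total += len(subjects)
    page_start += 20
print(total)                # manual testing put this at 330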
2. Fetching the JSON data and saving it to a txt file
Only the rating, the movie title (used later as the image filename) and the detail-page URL are kept:
def getRTUTxt():
    f = open('db.txt', 'a', encoding='utf-8')
    for page_start in range(0, 340, 20):
        try:
            url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={}".format(page_start)
            r = http.request('GET', url, headers={'User-Agent': str(UserAgent().random)})
            c = r.data.decode('utf-8')
            # json.loads handles JSON true/false natively, so no quoting is needed
            jsonDict = json.loads(c)
        except Exception as e:
            continue    # skip this page on a failed request, otherwise jsonDict would be undefined
        for item in jsonDict['subjects']:
            f.write(str(item['rate']) + ',' + str(item['title']) + ',' + str(item['url']) + '\n')
        print(page_start)
    f.close()
3. Getting the image URLs
Before fetching the image URLs, the lines written to the txt file have to be read back into a list:
def getRTUList():
    resList = []
    with open('db.txt', 'r', encoding='utf-8') as f:
        for line in f:
            resList.append(line)
    # the with-block closes the file; drop duplicate lines from repeated runs
    resList = list(set(resList))
    return resList
Then request each detail page and pull the image URL out of it:
def getImgUrl(url):
    r = http.request('GET', url, headers={'User-Agent': str(UserAgent().random)})
    c = r.data.decode('utf-8', 'ignore')
    soup = BeautifulSoup(c, 'lxml')
    # the poster on a Douban detail page sits inside <a class="nbgnbg"><img src=...></a>
    imgUrl = soup.find('a', class_='nbgnbg').find('img').get('src')
    return imgUrl
4. Multithreading and the complete code
from urllib3 import *
import json
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import threading

# suppress the InsecureRequestWarning for unverified HTTPS requests (see the PS below)
requests.packages.urllib3.disable_warnings()

basepath = 'D:/PyDownload/'
http = PoolManager()


def getRTUTxt():
    f = open('db.txt', 'a', encoding='utf-8')
    for page_start in range(0, 340, 20):
        try:
            url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={}".format(page_start)
            r = http.request('GET', url, headers={'User-Agent': str(UserAgent().random)})
            c = r.data.decode('utf-8')
            # json.loads handles JSON true/false natively, so no quoting is needed
            jsonDict = json.loads(c)
        except Exception as e:
            continue    # skip this page on a failed request
        for item in jsonDict['subjects']:
            f.write(str(item['rate']) + ',' + str(item['title']) + ',' + str(item['url']) + '\n')
        print(page_start)
    f.close()


def getRTUList():
    resList = []
    with open('db.txt', 'r', encoding='utf-8') as f:
        for line in f:
            resList.append(line)
    # drop duplicate lines from repeated runs
    resList = list(set(resList))
    return resList


def getImgUrl(url):
    r = http.request('GET', url, headers={'User-Agent': str(UserAgent().random)})
    c = r.data.decode('utf-8', 'ignore')
    soup = BeautifulSoup(c, 'lxml')
    imgUrl = soup.find('a', class_='nbgnbg').find('img').get('src')
    return imgUrl


def downLoad(imgUrl, filename):
    urlretrieve(imgUrl, filename)


if __name__ == '__main__':
    # getRTUTxt()
    movieList = getRTUList()
    tList = []
    for line in movieList:
        try:
            parts = line.strip().split(',')      # rate,title,url
            title = parts[1]
            imgUrl = getImgUrl(parts[2])
            filename = basepath + title + '.jpg'
            t = threading.Thread(target=downLoad, args=(imgUrl, filename))
            t.start()
            tList.append(t)
        except Exception as e:
            continue

    for t in tList:
        t.join()
Downloading all 330 images took roughly 4 minutes.
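Spawning one thread per image works, but if you prefer a bounded number of worker threads, the standard-library concurrent.futures can replace the manual thread bookkeeping; a rough sketch reusing the downLoad, getImgUrl, getRTUList and basepath defined above (the 16-worker limit is an arbitrary choice, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def fetchOne(line):
    # line has the form "rate,title,url" as written by getRTUTxt()
    try:
        parts = line.strip().split(',')
        title, pageUrl = parts[1], parts[2]
        downLoad(getImgUrl(pageUrl), basepath + title + '.jpg')
    except Exception:
        pass    # mirror the original script: skip any movie whose page or image fails

# the with-block waits for all submitted downloads to finish
with ThreadPoolExecutor(max_workers=16) as pool:
    pool.map(fetchOne, getRTUList())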
PS: the following line runs without errors, but the warning it prints is annoying:
r = http.request('GET', url, headers={'User-Agent': str(UserAgent().random)})
Warning output:
C:\Users\Jodness\PycharmProjects\DownLoadImg\venv\lib\site-packages\urllib3\connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
To disable the warning:
import requests
requests.packages.urllib3.disable_warnings()
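Since the script already talks to urllib3 directly through PoolManager, the same warning can also be silenced without importing requests at all, via urllib3's own disable_warnings():

import urllib3
urllib3.disable_warnings()    # suppresses InsecureRequestWarning and other HTTPWarning subclasses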