Scraping images concurrently with ThreadPoolExecutor from Python's concurrent.futures module
The site has anti-scraping measures, so you may want to swap in a different URL.
Use submit to add a task to the thread pool (the pow function in the snippet below); submit returns a Future object. future.result() retrieves the value the call returned, blocking until the task finishes if it has not completed yet.
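A minimal illustration of these calls, using pow as the task, the way the standard-library examples do:

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(pow, 2, 10)   # schedule pow(2, 10) on a worker thread
        print(future.result())                 # blocks until the task finishes, prints 1024

        # add_done_callback runs a function with the finished Future as its argument;
        # the crawler below chains its stages with this pattern.
        executor.submit(pow, 3, 4).add_done_callback(lambda fut: print(fut.result()))  # prints 81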
The concurrent crawler:

    import time, re, os
    import requests
    import hashlib
    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(20)
    start_url = 'https://www.dushu.com'
    file_path = './imgs'
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    def get_index_page(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Referer': url,
        }
        resp = requests.get(url, headers=headers)
        print(resp.cookies.get_dict())  # debug: inspect the cookies the site sets
        if resp.status_code == 200:
            txt = resp.content.decode('utf-8')
            # with open("1.html", "w", encoding='utf-8') as f:
            #     f.write(txt)
            return txt

    # Extract the book detail-page URLs from an index page
    def parse_index(index_page):
        index_page = index_page.result()
        if not index_page:  # the request failed, nothing to parse
            return
        book_urls = re.findall(r'<div class="book-info">.*?<a href="(.*?)"', index_page, re.S)
        for book_url in book_urls:
            pool.submit(get_index_page, start_url + book_url).add_done_callback(get_detail_page)

    # Extract the cover-image URL from a book detail page
    def get_detail_page(book_page):
        book_page = book_page.result()
        if not book_page:
            return
        img_urls = re.findall(r'<div class="pic">.*?<img src="(.*?)"', book_page, re.S)
        if img_urls:
            pool.submit(get_img, img_urls[0])

    # Download and save one image
    def get_img(img_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Referer': 'https://www.dushu.com/book/13772054/',
        }
        resp = requests.get(img_url, headers=headers)
        print(resp.status_code, img_url)
        # Hash the timestamp plus the URL to get a unique file name
        m = hashlib.md5()
        m.update(str(time.time()).encode('utf-8'))
        m.update(img_url.encode('utf-8'))
        img_path = '%s/%s.jpg' % (file_path, m.hexdigest())
        with open(img_path, 'wb') as f:
            f.write(resp.content)
        print("download finished")

    if __name__ == '__main__':
        base_url = 'https://www.dushu.com/book/1158_{}.html'
        for i in range(1, 2):  # controls how many index pages are crawled
            time.sleep(1)
            url = base_url.format(i)
            pool.submit(get_index_page, url).add_done_callback(parse_index)
        # Note: the main thread exits right after submitting the index pages, so the
        # chained callbacks race against interpreter shutdown and may fail to submit
        # their follow-up tasks; the as_completed variant sketched below avoids this.
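The callback chain works, but the main thread has no way to wait for the later stages, and on recent Python versions the chained submits can race against interpreter shutdown. A minimal alternative sketch that drives the same three stages with as_completed so the with-block waits for every download; it assumes get_index_page, get_img and start_url from the script above are in scope:

    import re
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def crawl(page_numbers):
        base_url = 'https://www.dushu.com/book/1158_{}.html'
        with ThreadPoolExecutor(20) as pool:
            # Stage 1: fetch the index pages concurrently.
            index_futures = [pool.submit(get_index_page, base_url.format(i)) for i in page_numbers]
            book_urls = []
            for fut in as_completed(index_futures):
                page = fut.result()
                if page:
                    book_urls += [start_url + u for u in
                                  re.findall(r'<div class="book-info">.*?<a href="(.*?)"', page, re.S)]

            # Stage 2: fetch each book page and pull out the cover-image URL.
            book_futures = [pool.submit(get_index_page, u) for u in book_urls]
            img_urls = []
            for fut in as_completed(book_futures):
                page = fut.result()
                if page:
                    found = re.findall(r'<div class="pic">.*?<img src="(.*?)"', page, re.S)
                    if found:
                        img_urls.append(found[0])

            # Stage 3: download every cover; exiting the with-block waits for the pool.
            list(pool.map(get_img, img_urls))

    # crawl(range(1, 2))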
The following non-concurrent version downloads the images successfully:
    import time, re, os
    import requests
    import hashlib

    start_url = 'https://www.dushu.com'
    file_path = './imgs'
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    def get_index_page(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Referer': url,
        }
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            txt = resp.content.decode('utf-8')
            # with open("1.html", "w", encoding='utf-8') as f:
            #     f.write(txt)
            return txt

    # Extract the book detail-page URLs from an index page
    def parse_index(index_page):
        # img_urls = re.findall(r'<div class="book-info">.*?data-original="(.*?)"', index_page, re.S)
        book_urls = re.findall(r'<div class="book-info">.*?<a href="(.*?)"', index_page, re.S)
        for book_url in book_urls:
            yield start_url + book_url

    # Extract the cover-image URL from a book detail page
    def get_detail_page(book_page):
        img_urls = re.findall(r'<div class="pic">.*?<img src="(.*?)"', book_page, re.S)
        if img_urls:
            return img_urls[0]

    # Download and save one image
    def get_img(img_url, book_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Referer': book_url,
        }
        resp = requests.get(img_url, headers=headers)
        # Hash the timestamp plus the URL to get a unique file name
        m = hashlib.md5()
        m.update(str(time.time()).encode('utf-8'))
        m.update(img_url.encode('utf-8'))
        img_path = '%s/%s.jpg' % (file_path, m.hexdigest())
        with open(img_path, 'wb') as f:
            f.write(resp.content)

    if __name__ == '__main__':
        base_url = 'https://www.dushu.com/book/1158_{}.html'
        # Controls how many index pages are crawled
        for i in range(1, 2):
            time.sleep(1)
            url = base_url.format(i)
            index_page = get_index_page(url)
            if not index_page:  # request failed, skip this index page
                continue
            book_urls = parse_index(index_page)
            for book_url in book_urls:
                book_page = get_index_page(book_url)
                if not book_page:
                    continue
                img_url = get_detail_page(book_page)
                if img_url:  # skip books whose detail page has no cover image
                    get_img(img_url, book_url)
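The sequential version already factors each stage into a function, so it can be parallelized with the same pool but without callbacks. A minimal sketch, assuming get_index_page, parse_index, get_detail_page and get_img from the script above are in scope:

    from concurrent.futures import ThreadPoolExecutor

    def crawl_book(book_url):
        # One unit of work: fetch a book page, find its cover image, download it.
        book_page = get_index_page(book_url)
        if not book_page:
            return
        img_url = get_detail_page(book_page)
        if img_url:
            get_img(img_url, book_url)

    base_url = 'https://www.dushu.com/book/1158_{}.html'
    with ThreadPoolExecutor(20) as pool:
        for i in range(1, 2):
            index_page = get_index_page(base_url.format(i))
            if index_page:
                # map fans the per-book work out across the pool; forcing the
                # iterator with list() waits for this page's downloads to finish.
                list(pool.map(crawl_book, parse_index(index_page)))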
Reference: https://www.cdsy.xyz/computer/programme/Python/241210/cd64984.html
Video example: https://www.bilibili.com/video/BV1CE411i73L?p=4