python并发模块concurrent中的ThreadPoolExecutor并发爬取图片
该网站有反爬措施,可以换个网址
使用submit方法往线程池中加入一个task,submit返回一个Future对象。future.result()方法的作用是拿到任务调用返回的结果;如果任务还没有执行完毕,result()会阻塞等待。
# 并发爬取
import time,re,os
import requests
import hashlib
from concurrent.futures import ThreadPoolExecutor
pool = ThreadPoolExecutor(20)
start_url = 'https://www.dushu.com'
file_path = './imgs'
if not os.path.exists(file_path):
os.makedirs(file_path)
def get_index_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
'Referer':url,
}
resp = requests.get(url,headers=headers)
resp.iter_content()
print(resp.cookies.get_dict())
if resp.status_code == 200:
txt = resp.content.decode('utf-8')
# with open("1.html","w",encoding='utf-8')as f:
# f.write(txt)
return txt
# 获取书籍url
def parse_index(index_page):
index_page = index_page.result()
book_urls = re.findall(r'<div class="book-info">.*?<a href="(.*?)"',index_page,re.S)
for book_url in book_urls:
pool.submit(get_index_page,start_url + book_url).add_done_callback(get_detail_page)
# 获取封面图url
def get_detail_page(book_page):
book_page = book_page.result()
img_urls = re.findall(r'<div class="pic">.*?<img src="(.*?)"', book_page, re.S)
if img_urls:
pool.submit(get_img,img_urls[0])
# 保存图片
def get_img(img_url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
'Referer': 'https://www.dushu.com/book/13772054/',
}
resp = requests.get(img_url,headers=headers)
print(resp.status_code,book_url)
m = hashlib.md5()
m.update(str(time.time()).encode('utf-8'))
m.update(img_url.encode('utf-8'))
img_path = '%s/%s.jpg'%(file_path,m.hexdigest())
with open(img_path,'wb')as f:
f.write(resp.content)
print("完成下载")
if __name__ == '__main__':
base_url = 'https://www.dushu.com/book/1158_{}.html'
for i in range(1,2):
time.sleep(1)
url = base_url.format(i)
pool.submit(get_index_page,url).add_done_callback(parse_index)
以下是非并发(顺序执行)的版本,可以正常下载图片:
import time,re,os
import requests
import hashlib
start_url = 'https://www.dushu.com'
file_path = './imgs'
if not os.path.exists(file_path):
os.makedirs(file_path)
def get_index_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
'Referer':url,
}
resp = requests.get(url,headers=headers)
if resp.status_code == 200:
txt = resp.content.decode('utf-8')
# with open("1.html","w",encoding='utf-8')as f:
# f.write(txt)
return txt
# 获取书籍url
def parse_index(index_page):
# img_urls = re.findall(r'<div class="book-info">.*?data-original="(.*?)"',index_page,re.S)
book_urls = re.findall(r'<div class="book-info">.*?<a href="(.*?)"',index_page,re.S)
for book_url in book_urls:
yield start_url + book_url
# 获取封面图url
def get_detail_page(book_page):
img_urls = re.findall(r'<div class="pic">.*?<img src="(.*?)"', book_page, re.S)
if img_urls:
return img_urls[0]
# 保存图片
def get_img(img_url,book_url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
'Referer': book_url,
}
resp = requests.get(img_url,headers=headers)
m = hashlib.md5()
m.update(str(time.time()).encode('utf-8'))
m.update(img_url.encode('utf-8'))
img_path = '%s/%s.jpg'%(file_path,m.hexdigest())
with open(img_path,'wb')as f:
f.write(resp.content)
if __name__ == '__main__':
base_url = 'https://www.dushu.com/book/1158_{}.html'
# 控制爬取的页数
for i in range(1,2):
time.sleep(1)
url = base_url.format(i)
index_page = get_index_page(url)
book_urls = parse_index(index_page)
for book_url in book_urls:
book_page = get_index_page(book_url)
img_url = get_detail_page(book_page)
get_img(img_url,book_url)
参考:https://www.cdsy.xyz/computer/programme/Python/241210/cd64984.html
视频例子:https://www.bilibili.com/video/BV1CE411i73L?p=4