浏览器打开要爬取的网站,右键点击查看页面源代码
找到想要爬取的图像的源地址
# Basic image scraper: fetch a page, extract <img ... src="..."> URLs with a
# regex, and save each image to the current directory numbered by position.
import re
import urllib.request

# URL of the page to scrape
url = r'https://blog.csdn.net/L888666Q/article/details/127208352?spm=1001.2014.3001.5501'

with urllib.request.urlopen(url) as fp:
    # Read the page body and decode it to text (default UTF-8)
    content = fp.read().decode()

# Regex locating the images to scrape — captures the src attribute of
# <img alt ... src="..."> tags seen in the page source.
result = re.findall(r'<img alt.+?src="(.+?)".+?/>', content)

# Download and save every matched image, named by its loop index.
for index, item in enumerate(result):
    print(item)
    with urllib.request.urlopen(item) as fp1:
        content = fp1.read()
    with open(str(index) + '.jpg', 'wb') as fp2:
        fp2.write(content)
print("finish.")
带有反爬机制的网站urlopen时会报403错误
# Image scraper for sites with anti-scraping measures: a bare urlopen() gets
# HTTP 403, so every request carries a browser-like User-Agent header.
import re
import urllib.request
import urllib.parse

url = r'https://blog.csdn.net/L888666Q/article/details/127208352?spm=1001.2014.3001.5501'

# Pretend to be a browser (search online for "User-Agent list" for other values)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
req = urllib.request.Request(url, headers=headers)

with urllib.request.urlopen(req) as fp:
    content = fp.read().decode()

# Capture the src attribute of <img alt ... src="..."> tags
result = re.findall(r'<img alt.+?src="(.+?)".+?/>', content)

for index, item in enumerate(result):
    print(item)
    # If the image src is a relative path, join it with the page URL to get
    # an absolute https:// address.
    item = urllib.parse.urljoin(url, item)
    # Fix: the image request must also carry the User-Agent header, otherwise
    # the same anti-scraping check rejects the image download with 403.
    img_req = urllib.request.Request(item, headers=headers)
    with urllib.request.urlopen(img_req) as fp1:
        content = fp1.read()
    # Name the file after the last path segment instead of the loop index.
    imgname = item[item.rindex('/') + 1:]
    with open(imgname, 'wb') as fp2:
        fp2.write(content)
print("finish.")