浏览器打开要爬取的网站,右键点击查看页面源代码
找到想要爬取的图像的源地址
# Basic image scraper: fetch a page, extract <img ... src="..."> URLs with a
# regex, and save each image to the current directory numbered by position.
import re
import urllib.request

# URL of the page to scrape
url = r'https://blog.csdn.net/L888666Q/article/details/127208352?spm=1001.2014.3001.5501'

with urllib.request.urlopen(url) as fp:
    # Read the page body and decode it to text (default UTF-8)
    content = fp.read().decode()

# Regex locating the images to scrape — captures the src attribute of
# <img alt ... src="..."> tags seen in the page source.
result = re.findall(r'<img alt.+?src="(.+?)".+?/>', content)

# Download and save every matched image, named by its loop index.
for index, item in enumerate(result):
    print(item)
    with urllib.request.urlopen(item) as fp1:
        content = fp1.read()
    with open(str(index) + '.jpg', 'wb') as fp2:
        fp2.write(content)
print("finish.")
带有反爬机制的网站urlopen时会报403错误
# Image scraper for sites with anti-scraping measures: a bare urlopen() gets
# HTTP 403, so every request carries a browser-like User-Agent header.
import re
import urllib.request
import urllib.parse

url = r'https://blog.csdn.net/L888666Q/article/details/127208352?spm=1001.2014.3001.5501'

# Pretend to be a browser (search online for "User-Agent list" for other values)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
req = urllib.request.Request(url, headers=headers)

with urllib.request.urlopen(req) as fp:
    content = fp.read().decode()

# Capture the src attribute of <img alt ... src="..."> tags
result = re.findall(r'<img alt.+?src="(.+?)".+?/>', content)

for index, item in enumerate(result):
    print(item)
    # If the image src is a relative path, join it with the page URL to get
    # an absolute https:// address.
    item = urllib.parse.urljoin(url, item)
    # Fix: the image request must also carry the User-Agent header, otherwise
    # the same anti-scraping check rejects the image download with 403.
    img_req = urllib.request.Request(item, headers=headers)
    with urllib.request.urlopen(img_req) as fp1:
        content = fp1.read()
    # Name the file after the last path segment instead of the loop index.
    imgname = item[item.rindex('/') + 1:]
    with open(imgname, 'wb') as fp2:
        fp2.write(content)
print("finish.")