2025年3月25日 星期二 甲辰(龙)年 月廿四 设为首页 加入收藏
rss
您当前的位置:首页 > 计算机 > 编程开发 > Python

python爬虫 爬取图片

时间:11-05来源:作者:点击数:52

爬取单页图片

  • #网址:https://pic.netbian.com/ 我们爬取:https://pic.netbian.com/4kmeinv/
  • from lxml import etree
  • import requests
  • import os
  • if __name__=="__main__":
  • if not os.path.exists('zhaopian'):
  • os.mkdir('zhaopian')
  • headers={
  • 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
  • }
  • url='https://pic.netbian.com/4kmeinv/'
  • response=requests.get(url=url,headers=headers)
  • #response.encoding= 'utf-8'
  • resp=response.text
  • #数据解析
  • tree=etree.HTML(resp)
  • li_list=tree.xpath('//div[@class="slist"]/ul/li')
  • for li in li_list:
  • tupian_src='https://pic.netbian.com'+li.xpath('./a/img/@src')[0]
  • #print(tupian_src)
  • name=li.xpath('./a/b/text()')[0]+'.jpg'
  • name = name.encode('iso-8859-1').decode('gbk')
  • #print(name)
  • data=requests.get(url=tupian_src,headers=headers).content
  • #存储地址
  • path='zhaopian/'+name
  • with open(path,'wb') as fp:
  • fp.write(data)
  • print("下载成功!!!")
  • import os
  • import requests
  • from lxml import etree
  • header = {
  • "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
  • }
  • url = "https://pic.netbian.com/4kmeinv/"
  • girl_data=requests.get(url=url,headers=header).text
  • # 处理乱码 .decode('gbk') 可去掉
  • girl_data=girl_data.encode("iso-8859-1").decode('gbk')
  • #实例化模型
  • girl_etree=etree.HTML(girl_data)
  • #xpath表达式代码后说明
  • picture_loc=girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src")
  • picture_name_list=girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@alt")
  • #新增文件夹
  • if not os.path.exists("you_knew_about_picture"):
  • os.mkdir("./you_knew_about_picture")
  • #增加索引定 picture_name_list
  • for i,each_loc in enumerate(picture_loc):
  • #完整网址
  • new_loc="https://pic.netbian.com/"+each_loc
  • #爬取图片
  • each_picture_data=requests.get(new_loc,headers=header).content
  • #each_picture_name由文件路径和名组成
  • each_picture_name="you_knew_about_picture/"+picture_name_list[i]+".jpg"
  • #打开文件
  • fp=open(each_picture_name,mode="wb")
  • #写入
  • fp.write(each_picture_data)
  • fp.close()
  • #提示完成
  • print(each_picture_name.split("/")[-1]+" have been over")

爬取多页图片

  • #爬取多页:第一页;https://pic.netbian.com/4kmeinv/
  • #第二页:https://pic.netbian.com/4kmeinv/index_2.html
  • #第三页:https://pic.netbian.com/4kmeinv/index_3.html
  • import requests
  • from lxml import etree
  • import os
  • #爬取2-4
  • if __name__=="__main__":
  • if not os.path.exists('zhao'):
  • os.mkdir('zhao')
  • headers = {
  • 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
  • }
  • for i in range(2,5):
  • url=f'https://pic.netbian.com/4kmeinv/index_{i}.html'
  • #print(url)
  • resp=requests.get(url=url,headers=headers).text
  • tree=etree.HTML(resp)
  • li_list=tree.xpath('//*[@id="main"]/div[3]/ul/li')
  • for li in li_list:
  • src_url='https://pic.netbian.com'+li.xpath('./a/img/@src')[0]
  • #print(src_url)
  • src_name=li.xpath('./a/b/text()')[0]+'.jpg'
  • name=src_name.encode('iso-8859-1').decode('gbk')
  • #print(name)
  • data=requests.get(url=src_url,headers=headers).content
  • path='zhao/'+name
  • with open(path,'wb') as fp:
  • fp.write(data)
  • print('下载成功!!!')

别人代码

  • import os
  • import requests
  • from lxml import etree
  • header = {
  • "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
  • }
  • picture_loc = [] # 图片存地址
  • picture_name_list = [] # 存图片名
  • # 第2,3页图片,可自行调节
  • # 不能包括1,因为1页面网址和后面网址不一样,如想包括,可添加if条件判断
  • for i in range(2, 4):
  • # 一定要在循环内,否则一直为"https://pic.netbian.com/4kmeinv/index_2.html"
  • # 关于为什么后面是/4kmeinv/index_{0}.html 代码后讲解
  • url = "https://pic.netbian.com/4kmeinv/index_{0}.html"
  • url = url.format(i)
  • girl_data = requests.get(url=url, headers=header).text
  • girl_data = girl_data.encode("iso-8859-1").decode('gbk')
  • girl_etree = etree.HTML(girl_data, )
  • # 地址压入
  • picture_loc.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src"))
  • # 图片名压入
  • picture_name_list.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/b/text()"))
  • if not os.path.exists("you_knew_about_picture"):
  • os.mkdir("./you_knew_about_picture")
  • a = 0 # 记录图片个数
  • for i, each_loc in enumerate(picture_loc):
  • new_loc = "https://pic.netbian.com/" + each_loc
  • each_picture_data = requests.get(new_loc, headers=header).content
  • each_picture_name = "you_knew_about_picture/" + str(a) + " . " + picture_name_list[i] + ".jpg"
  • fp = open(each_picture_name, mode="wb")
  • fp.write(each_picture_data)
  • fp.close()
  • print(each_picture_name.split("/")[-1] + " have been over")
  • a = a + 1
  • print(a)
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门