【Python】a链接以及标题爬取
# -*- coding:utf-8 -*-
# python 3.7
# System library (command-line arguments)
import sys
# Network request library
import urllib.request
# Type-hint helpers (the typing module keeps annotations valid on Python 3.7)
from typing import Iterable, Optional, Set

# HTML document parsing library
from bs4 import BeautifulSoup
# Page scraped when no URL is passed on the command line.
DEFAULT_URL = "http://it-cxy.top/"
# File that receives the extracted "title link" entries.
OUTPUT_PATH = "result.txt"
# Value passed to find_all(class_=...) to select the link-carrying elements.
LINK_CLASS = "list url-list"


def is_valid_link(link: Optional[str]) -> bool:
    """Return True when *link* is a usable hyperlink target.

    A link is rejected when the attribute is missing (``None``), when it is
    the bare anchor ``#``, or when it contains ``javascript:`` anywhere
    (which also covers the common ``javascript:void(0)`` placeholder).
    """
    if link is None:
        return False
    return link != "#" and "javascript:" not in link


def format_entry(title: Optional[str], link: str) -> str:
    """Build one output entry of the form ``"<title> <link>"``.

    When the element has no title attribute, only the link is returned
    instead of failing on ``None + str``.
    """
    if title is None:
        return link
    return title + " " + link


def fetch_html(url: str) -> bytes:
    """Download *url* and return the raw response body."""
    request = urllib.request.Request(url)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def extract_entries(html: bytes) -> Set[str]:
    """Parse *html* and collect the unique entries of all valid links.

    Elements are selected with ``find_all(class_=LINK_CLASS)``; the title is
    read from their ``link-title`` attribute and the target from their
    ``link-url`` attribute. A set is used so duplicates collapse.
    """
    soup = BeautifulSoup(html, "html.parser")
    entries = set()
    for element in soup.find_all(class_=LINK_CLASS):
        link = element.get("link-url")
        if is_valid_link(link):
            entries.add(format_entry(element.get("link-title"), link))
    return entries


def write_entries(entries: Iterable[str], path: str = OUTPUT_PATH) -> None:
    """Write *entries* to *path* as UTF-8 text, one entry per line.

    Entries are sorted so the file content is the same on every run
    (plain set iteration order is not stable for strings).
    """
    # The with-block guarantees the file is closed even if a write fails.
    with open(path, "w", encoding="utf-8") as output:
        output.writelines(entry + "\n" for entry in sorted(entries))


def main() -> None:
    """Scrape the page given as the first CLI argument (or the default page)
    and store the extracted links in ``result.txt``."""
    url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URL
    write_entries(extract_entries(fetch_html(url)))
    print("\r\n扫描结果已写入到result.txt文件中\r\n")


if __name__ == "__main__":
    main()