# 爬取单页 (scrape a single page)
import requests
from lxml import etree
if __name__ == "__main__":
    # Script: download the anquan123.com home page and print, for every entry
    # of the selected list, the link text followed by the second span's text.

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    url = 'https://www.anquan123.com/'

    # Without a timeout, requests may wait indefinitely on a silent server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page into no results.
    response.raise_for_status()
    resp = response.text

    # Parse the HTML and select the list entries.
    tree = etree.HTML(resp)
    ul_list = tree.xpath('/html/body/div[3]/div/div[3]/div[2]/ul/li[2]/ul/li')
    for li in ul_list:
        # normalize-space() strips surrounding whitespace; it evaluates to ''
        # when the entry has no matching link text.
        wang_url = li.xpath('normalize-space(./span/a/text())')
        print(wang_url)
        # text() yields a list, which is empty when the second span is missing
        # or has no text; fall back to '' instead of raising IndexError.
        name_parts = li.xpath('./span[2]/text()')
        name = name_parts[0] if name_parts else ''
        print(name)
# 爬取单页:保存为csv (scrape a single page and save the result as CSV)
import requests
from lxml import etree
import pandas as pd
def get_data(timeout=10):
    """Download the anquan123.com home page and extract its site list.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(a_url, a_type)`` of two lists of equal length. ``a_url``
        holds each entry's link text prefixed with ``https://``; ``a_type``
        holds the text of the entry's second ``span`` (``''`` when absent).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    url = 'https://www.anquan123.com/'

    # Without a timeout, requests may wait indefinitely on a silent server.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently returning two empty lists.
    response.raise_for_status()
    resp = response.text

    a_url = []   # collected site URLs
    a_type = []  # collected site types

    # Parse the HTML and select the list entries.
    tree = etree.HTML(resp)
    ul_list = tree.xpath('/html/body/div[3]/div/div[3]/div[2]/ul/li[2]/ul/li')
    for li in ul_list:
        # normalize-space() strips surrounding whitespace; it evaluates to ''
        # when the entry has no matching link text, leaving a bare 'https://'.
        wang_url = li.xpath('normalize-space(./span/a/text())')
        a_url.append('https://' + wang_url)

        # text() yields a list, which is empty when the second span is missing
        # or has no text. Use '' so both lists stay the same length and no
        # IndexError aborts the whole scrape.
        name_parts = li.xpath('./span[2]/text()')
        a_type.append(name_parts[0] if name_parts else '')

    return a_url, a_type
# Save the scraped URLs and types as a CSV file.
def Cun(a_url, a_type, path="data.csv"):
    """Write the URL and type lists to a CSV file and print a confirmation.

    Args:
        a_url: Values for the '网址' column.
        a_type: Values for the '类型' column; pandas requires it to be the
            same length as ``a_url``.
        path: Destination file. Defaults to ``data.csv`` in the current
            working directory.
    """
    # Column name -> column values.
    dic = {
        '网址': a_url,
        '类型': a_type,
    }
    # Turn the dictionary into a DataFrame.
    df = pd.DataFrame(dic)
    # 'utf-8-sig' prepends a byte-order mark. With plain 'utf-8' the Chinese
    # text shows up garbled when the file is opened in Excel.
    df.to_csv(path, encoding='utf-8-sig', index=False)
    print('保存完成!')
if __name__ == "__main__":
    # Entry point: scrape the page first, then write both result lists to CSV.
    a_url, a_type = get_data()
    Cun(a_url=a_url, a_type=a_type)
# 保存csv (save as CSV)
# Save the scraped URLs and types as a CSV file.
def Cun(a_url, a_type, path="data.csv"):
    """Write the URL and type lists to a CSV file and print a confirmation.

    Args:
        a_url: Values for the '网址' column.
        a_type: Values for the '类型' column; pandas requires it to be the
            same length as ``a_url``.
        path: Destination file. Defaults to ``data.csv`` in the current
            working directory.
    """
    # Column name -> column values.
    dic = {
        '网址': a_url,
        '类型': a_type,
    }
    # Turn the dictionary into a DataFrame.
    df = pd.DataFrame(dic)
    # 'utf-8-sig' prepends a byte-order mark; writing plain 'utf-8' left the
    # text garbled when the file was opened in Excel.
    df.to_csv(path, encoding='utf-8-sig', index=False)
    print('保存完成!')