爬取单页
import requests
from lxml import etree

if __name__ == "__main__":
    # Crawl a single listing page and print each site's URL text and category.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }

    url = 'https://www.anquan123.com/'
    # timeout= keeps the request from hanging indefinitely; raise_for_status()
    # fails fast on HTTP errors instead of silently parsing an error page
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.raise_for_status()

    tree = etree.HTML(resp.text)
    ul_list = tree.xpath('/html/body/div[3]/div/div[3]/div[2]/ul/li[2]/ul/li')

    for li in ul_list:
        # normalize-space() always yields a plain string (possibly empty),
        # with surrounding/internal whitespace collapsed
        wang_url = li.xpath('normalize-space(./span/a/text())')
        print(wang_url)

        # xpath() returns [] when the node is missing; guard the [0] index
        # so one malformed <li> does not abort the whole loop with IndexError
        name_nodes = li.xpath('./span[2]/text()')
        name = name_nodes[0] if name_nodes else ''
        print(name)
-
爬取单页:保存为csv
- import requests
- from lxml import etree
- import pandas as pd
-
def get_data():
    """Scrape the site list from anquan123.com.

    Returns:
        tuple[list[str], list[str]]: parallel lists — site URLs (with an
        ``https://`` prefix added) and their category labels.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }

    url = 'https://www.anquan123.com/'
    # timeout= keeps the request from hanging indefinitely; raise_for_status()
    # surfaces HTTP errors instead of parsing an error page as data
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.raise_for_status()

    tree = etree.HTML(resp.text)
    ul_list = tree.xpath('/html/body/div[3]/div/div[3]/div[2]/ul/li[2]/ul/li')

    a_url = []
    a_type = []
    for li in ul_list:
        # normalize-space() returns a plain whitespace-collapsed string;
        # the page lists bare domains, so prepend the scheme
        wang_url = 'https://' + li.xpath('normalize-space(./span/a/text())')
        a_url.append(wang_url)

        # xpath() returns [] when the node is absent; guard against IndexError
        # so one malformed <li> does not abort the whole scrape
        name_nodes = li.xpath('./span[2]/text()')
        a_type.append(name_nodes[0] if name_nodes else '')

    return a_url, a_type
-
-
-
def Cun(a_url, a_type):
    """Save the scraped URL/type lists to data.csv.

    Args:
        a_url: list of site URLs (written to the '网址' column).
        a_type: parallel list of category labels (the '类型' column).
    """
    dic = {
        '网址': a_url,
        '类型': a_type,
    }
    df = pd.DataFrame(dic)
    # utf-8-sig writes a BOM so Excel detects the encoding and renders the
    # Chinese headers correctly; plain utf-8 would be mojibake in Excel.
    # This also matches the other Cun() variant later in these notes.
    df.to_csv("data.csv", encoding='utf-8-sig', index=False)
    print('保存完成!')
-
- if __name__=="__main__":
- a_url,a_type=get_data()
-
-
- Cun(a_url,a_type)
-
-
保存csv
-
def Cun(a_url, a_type):
    """Write the url/type pairs to data.csv (UTF-8 with BOM, for Excel).

    Args:
        a_url: list of site URLs ('网址' column).
        a_type: parallel list of category labels ('类型' column).
    """
    table = pd.DataFrame({
        '网址': a_url,
        '类型': a_type,
    })
    # utf-8-sig prepends a BOM so Excel recognises the encoding
    table.to_csv("data.csv", encoding='utf-8-sig', index=False)
    print('保存完成!')
-