处理网址:http://www.qianmu.org/ranking/1528.htm
# 获取qianmu迁木网QS世界大学排名信息
import requests
from lxml import etree
import re
def fetch(start_url, timeout=10):
    """Download a page and return its body as text.

    Args:
        start_url: URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            server would hang the whole crawl.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(start_url, timeout=timeout)
    # raise_for_status() does nothing for non-error statuses, so it can be
    # called unconditionally instead of being guarded by a status check.
    r.raise_for_status()
    return r.text
def process_detail(link, length, num):
    """Scrape one detail page into a dict.

    Args:
        link: URL of the detail page.
        length: Total number of detail pages; used only in the progress line.
        num: Position of this page in the crawl; used only in the progress
            line.

    Returns:
        A dict holding the page title under ``'name'`` plus one entry per
        infobox row (text of the first cell -> text of the second cell).
        ``None`` is returned when the page has no title, has no infobox, or
        the infobox's first and second columns differ in length.
    """
    # Drop tabs and line breaks up front so they do not end up inside the
    # extracted title and cell text.
    html = fetch(link).replace('\t', '').replace('\n', '').replace('\r', '')
    select = etree.HTML(html)

    titles = select.xpath('//*[@id="wikiContent"]/h1/text()')
    if not titles:
        # Not a regular detail page: skip it rather than abort the whole
        # crawl with an IndexError.
        return None
    data = {'name': titles[0].strip()}
    print("处理进度:[%s]-%d/%d" % (data['name'], num, length))

    infoboxes = select.xpath('//div[@class="infobox"]')
    if not infoboxes:
        return None
    table = infoboxes[0]

    # Concatenate every text node inside a cell so nested markup
    # (links, spans, ...) does not truncate the extracted text.
    keys_list = [
        ''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[1]')
    ]
    values = [
        ''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[2]')
    ]
    if len(keys_list) != len(values):
        # Columns cannot be paired up reliably; treat the page as unusable.
        return None

    data.update(zip(keys_list, values))
    return data
def process_data(data):
    """Strip footnote markers from scraped values and print the result.

    Some values carry footnote references such as the ``*(3)`` in
    ``'9,771*(3)'``; every ``*(<digits>)`` marker is removed from every
    value. Nothing is printed when ``data`` is ``None`` or empty.

    Args:
        data: Mapping of field name to string value, or ``None``.
    """
    if not data:
        return
    footnote = re.compile(r'\*\(\d+\)')
    # sub() removes every marker. Replacing only the first findall() hit
    # left any different marker later in the same value untouched.
    new_data = {k: footnote.sub('', v) for k, v in data.items()}
    print(new_data)
if __name__ == "__main__":
    # Entry page: a ranking table whose rows (after the first) link to the
    # individual detail pages.
    start_url = 'http://www.qianmu.org/ranking/1528.htm'
    ranking_page = etree.HTML(fetch(start_url))
    detail_links = ranking_page.xpath(
        '//div[@class="rankItem"]/table/tbody/tr[position()>1]/td[2]/a[contains(@a,"")]/@href'
    )
    total = len(detail_links)
    # Scrape and print each detail page, passing a 1-based counter along
    # for the progress output.
    for position, detail_link in enumerate(detail_links, start=1):
        record = process_detail(detail_link, total, position)
        process_data(record)
效果图: