https://maoyan.com/board/4
F12 【点击123…页查看请求头的数据变化】
import requests
import re
import time
import random
class MaoyanSpider(object):
# 初始化请求参数
def __init__(self):
self.url = 'https://maoyan.com/board/4?offset={}'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
self.i = 0
# 获取html网页源码
def get_html(self, url):
html = requests.get(url=url, headers=self.headers).text
# 直接调用解析函数
self.parse_html(html)
# 正则解析函数
def parse_html(self, html):
regex = '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
pattern = re.compile(regex, re.S)
# dd_list: [(),(),()]
dd_list = pattern.findall(html)
self.save_html(dd_list)
# 保存数据函数
def save_html(self, dd_list):
item = {}
for dd in dd_list:
item['name'] = dd[0].strip()
item['star'] = dd[1].strip()[3:]
item['time'] = dd[2].strip()[5:15]
print(item)
self.i += 1
# 启动函数
def run(self):
# 循环获取1-10页,切片具体参数【起始,结束,步长】
for offset in range(0, 91, 10):
url = self.url.format(offset)
self.get_html(url)
# 随机休眠3-5秒,控制抓取频率
time.sleep(random.randint(3, 5))
print('电影数量:', self.i)
if __name__ == '__main__':
# 计算运行时间,并调用启动函数
start_time = time.time()
spider = MaoyanSpider()
spider.run()
end_time = time.time()
print('执行时间:%.2f' % (end_time - start_time))