import urllib.request
import urllib.parse

def loadPage(url, filename):
    """
    Send a request to the given URL and return the server's response.
    url: the URL to crawl
    filename: name of the file being processed (used for the progress message)
    """
    print("Downloading " + filename)
    # Identify as a regular desktop browser so the server returns the full page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.124 Safari/537.36"
    }

    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html

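# Usage sketch (hypothetical values): fetch the first results page of the
# "python" forum as raw bytes, ready to be handed to writePage below.
#
#   html = loadPage("http://tieba.baidu.com/f?kw=python&pn=0",
#                   "python Tieba page 1.html")
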
def writePage(html, filename):
    """
    Write the HTML content to a local file.
    html: body of the server response (bytes)
    filename: name of the file to save
    """
    print("Saving " + filename)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html.decode("utf-8"))

    print('-' * 30)

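# Usage sketch: the bytes returned by loadPage are decoded as UTF-8 and saved.
#
#   writePage(b"<html>...</html>", "sample.html")
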
def tiebaSpider(url, beginPage, endPage, kw):
    """
    Tieba spider scheduler: builds the full URL for each page,
    then downloads and saves it.
    url: the fixed first half of the Tieba URL
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
        filename = str(kw) + " Tieba page " + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)

        html = loadPage(fullurl, filename)
        writePage(html, filename)

    print("All done, thanks for using!")

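# Worked example of the paging math above: page 1 -> pn=0, page 2 -> pn=50,
# page 3 -> pn=100, so page 2 of the "python" forum resolves to
#   http://tieba.baidu.com/f?kw=python&pn=50
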
if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    while True:
        try:
            beginPage = int(input("Enter the first page of '%s' to crawl: " % kw))
            endPage = int(input("Enter the last page of '%s' to crawl: " % kw))
            if endPage >= beginPage:
                url = "http://tieba.baidu.com/f?"
                key = urllib.parse.urlencode({"kw": kw})
                fullurl = url + key
                tiebaSpider(fullurl, beginPage, endPage, kw)
                break
            else:
                print("Please enter valid page numbers; the last page must not be smaller than the first.")
        except ValueError:
            print("Please enter valid page numbers!")

