# Python web-scraping example: fetching translation results from Youdao online translate
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/3/4 20:45
# @Author : hyang
# @Site :
# @File : scrapy_youdao.py
# @Software: PyCharm
import json
import hashlib as hasher
import requests
import random
import time
import ssl
import urllib3
# Work around "<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed>"
# seen in some environments.
# NOTE(review): this swaps the ssl module's default HTTPS context factory for the
# unverified one, which turns off certificate verification for any code in this
# process that relies on that default — confirm this is acceptable.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings() # suppress urllib3 warnings
# Landing page; the script requests it first to collect cookies.
start_url = 'http://fanyi.youdao.com/'
# Endpoint that the translation form data is POSTed to.
post_url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
# Headers sent with the translation POST request.
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
"Referer": "http://fanyi.youdao.com/"
}
def get_JSKey(r_word):
    """Build the ``salt``/``sign`` pair that accompanies a translate request.

    Reproduces the signing done by the site's JavaScript (fanyi.min.js):
    the salt is the current time in milliseconds plus a random integer in
    0..9, and the sign is the MD5 hex digest of
    ``"fanyideskweb" + r_word + str(salt) + "ebSeFb%=XZ%T[KZ)c(sy!"``
    encoded as UTF-8.

    Args:
        r_word: The text to translate (``str``).

    Returns:
        dict: ``{"salt": <int>, "sign": <hex digest str>}``.
    """
    salt = int(time.time() * 1000) + random.randint(0, 9)
    payload = "fanyideskweb" + r_word + str(salt) + "ebSeFb%=XZ%T[KZ)c(sy!"
    sign = hasher.md5(payload.encode('utf-8')).hexdigest()
    return {"salt": salt, "sign": sign}
def get_cookies(url, timeout=10):
    """Send a GET request to *url* and return the cookies from the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The ``cookies`` attribute of the ``requests`` response.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return response.cookies
def get_content(r_word, url, cookies, js_key, timeout=10):
    """POST a translation request, print the JSON reply and return it.

    Args:
        r_word: The text to translate.
        url: Endpoint to POST the form data to.
        cookies: Cookies to send with the request.
        js_key: Mapping with ``"salt"`` and ``"sign"`` entries, as produced
            by ``get_JSKey`` for the same ``r_word``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response (also printed to stdout).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        ValueError: If the response body is not valid JSON.
    """
    post_data = {
        "i": r_word,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": js_key["salt"],  # must be the salt that was hashed into the sign
        "sign": js_key["sign"],
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTIME",
        "typoResult": "false",
    }
    response = requests.post(
        url,
        headers=headers,
        data=post_data,
        cookies=cookies,
        timeout=timeout,
    )
    json_str = response.json()
    print(json_str)
    # Hand the parsed reply back so callers can use it, not just read stdout.
    return json_str
if __name__ == '__main__':
    # Reference only: the JavaScript signing code from the site's
    # fanyi.min.js that get_JSKey reproduces.
    #
    #   t.asyRequest = function(e) {
    #   var t = e.i,
    #   i = "" + ((new Date).getTime() + parseInt(10 * Math.random(), 10)),
    #   o = n.md5("fanyideskweb" + t + i + "ebSeFb%=XZ%T[KZ)c(sy!");
    #   r && r.abort(),
    #   r = n.ajax({
    #   type: "POST",
    #   contentType: "application/x-www-form-urlencoded; charset=UTF-8",
    #   url: "/bbk/translate_m.do",
    #   data: {
    #   i: e.i,
    #   client: "fanyideskweb",
    #   salt: i,
    #   sign: o,
    #   tgt: e.tgt,
    #   from: e.from,
    #   to: e.to,
    #   doctype: "json",
    #   version: "3.0",
    #   cache: !0
    #   },
    r_word = input("please input the word you want to translate : ")
    cookies = get_cookies(start_url)  # fetch cookies from the landing page
    print('cookies=>', cookies)
    js_key = get_JSKey(r_word)
    print("js_key=>", js_key)
    # Send the request; get_content prints the JSON reply.
    get_content(r_word, post_url, cookies, js_key)