selenium-wire
1 安装
- pip install selenium-wire
-
2 简单例子
- from seleniumwire import webdriver
-
- driver = webdriver.Chrome()
-
- driver.get('https://www.google.com')
-
- # Access requests via the `requests` attribute
- for request in driver.requests:
- if request.response:
- print(
- request.url, # 请求的url
- request.response.status_code, # 状态码
- request.headers, # 请求的headers
- request.response.headers, # 返回的headers
- )
-
3 安装SSL(Windows不需要安装)
- # 基于 apt 的 Linux 系统(Debian/Ubuntu)
- sudo apt install openssl
-
- # 基于 RPM 的 Linux 系统(CentOS/RHEL)
- sudo yum install openssl
-
- # Alpine Linux
- sudo apk add openssl
-
4 远程网络驱动程序
- from selenium.webdriver.common.by import By
- from seleniumwire import webdriver
-
- options = {
- 'suppress_connection_errors': False,
- 'auto_config': False,
- 'addr': '0.0.0.0',
- 'port': 8087,
- 'proxy': {
- 'http': <forward proxy details like scheme://user:pass@ip:port>,
- 'https': <forward proxy details like scheme://user:pass@ip:port>,
- },
- }
-
- chrome_options = webdriver.ChromeOptions()
- chrome_options.add_argument('--proxy-server=kubernetes-service-name:8087')
- chrome_options.add_argument('--ignore-certificate-errors')
- chrome_options.add_argument("--disable-dev-shm-usage");
- chrome_options.add_argument("start-maximized");
- chrome_options.add_argument("disable-infobars");
- chrome_options.add_argument("--disable-extensions")
- chrome_options.add_argument("--disable-gpu");
- chrome_options.add_argument("--no-sandbox");
- chrome_options.add_argument("--user-data-dir=/root/chrome/data")
-
- chrome_options.binary_location = "/opt/google/chrome/chrome"
-
- s = Service("/usr/bin/chromedriver")
- browser = webdriver.Remote('http://selenium-service-name:4444/wd/hub',service=s, desired_capabilities=chrome_options.to_capabilities(), seleniumwire_options=options)
-
- print("Browser setup done.")
-
- try:
- print("Getting yt.")
- browser.get("https://www.youtube.com/")
- print("Saving screenshot for yt")
- browser.save_screenshot('yt.png')
- print("Extracting Xpath.")
- text = browser.find_element(By.XPATH,'/html/body/ytd-app/div/ytd-page-manager/ytd-browse/ytd-two-column-browse'
- '-results-renderer/div[1]/ytd-rich-grid-renderer/div['
- '6]/ytd-rich-item-renderer[1]/div/ytd-rich-grid-media/div[1]/div/div['
- '1]/h3/a/yt-formatted-string').text
- print(f'The title of the first video on youtube is : {text}')
- except Exception as e:
- print(e)
- finally:
- browser.quit()
- print(browser.requests)
-
5 访问请求
- driver.requests
- driver.last_request
- driver.wait_for_request(pat, timeout=10)
- driver.proxy = {
- 'http': 'http://user:pass@192.168.10.100:8888',
- 'https': 'https://user:pass@192.168.10.100:8889',
- }
- driver.har
- driver.iter_requests()
- def request_interceptor(request):
- del request.headers['Referer']
- request.headers['Referer'] = 'some_referer'
- driver.request_interceptor = request_interceptor
-
- def response_interceptor(request, response):
- if request.url == 'https://server.com/some/path':
- response.headers['New-Header'] = 'Some Value'
- driver.response_interceptor = response_interceptor
-
6 Options
- options = {
- 'addr': '192.168.0.10',
- 'port': 9999,
- 'auto_config': True,
- 'ca_cert': '/path/to/ca.crt',
- 'ca_key': '/path/to/ca.key',
- 'disable_capture': True,
- 'disable_encoding': True,
- 'enable_har': True,
- 'exclude_hosts': ['google-analytics.com'],
- 'ignore_http_methods': [],
- 'proxy': {
- 'http': 'http://user:pass@192.168.10.100:8888',
- 'https': 'https://user:pass@192.168.10.100:8889',
- 'no_proxy': 'localhost,127.0.0.1'
- },
- 'request_storage': 'memory',
- 'request_storage_base_dir': '/my/storage/folder',
- 'request_storage_max_size': 100,
- 'verify_ssl': True,
- 'suppress_connection_errors': False
- }
- driver = webdriver.Chrome(seleniumwire_options=options)
-
7 Request 对象
- body
- cert
- date
- headers
- host
- method
- params
- path
- querystring
- response
- url
- ws_messages
- abort(error_code=403)
- create_response(status_code, headers=(), body=b'')
-
8 WebSocketMessage 对象
- content
- date
- from_client
-
9 Response 对象
- body
- from seleniumwire.utils import decode
- body = decode(response.body, response.headers.get('Content-Encoding', 'identity'))
-
- date
- headers
- reason
- status_code
-
10 拦截Requests and Responses
- def interceptor(request):
- request.headers['New-Header'] = 'Some Value'
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- def interceptor(request):
- del request.headers['Referer']
- request.headers['Referer'] = 'some_referer'
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- def interceptor(request, response):
- if request.url == 'https://server.com/some/path':
- response.headers['New-Header'] = 'Some Value'
- driver.response_interceptor = interceptor
- driver.get(...)
-
-
- def interceptor(request):
- params = request.params
- params['foo'] = 'bar'
- request.params = params
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- import json
- def interceptor(request):
- if request.method == 'POST' and request.headers['Content-Type'] == 'application/json':
-
- body = request.body.decode('utf-8')
-
- data = json.loads(body)
-
- data['foo'] = 'bar'
-
- request.body = json.dumps(data).encode('utf-8')
-
- del request.headers['Content-Length']
- request.headers['Content-Length'] = str(len(request.body))
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- import base64
- auth = (
- base64.encodebytes('my_username:my_password'.encode())
- .decode()
- .strip()
- )
- def interceptor(request):
- if request.host == 'host_that_needs_auth':
- request.headers['Authorization'] = f'Basic {auth}'
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- def interceptor(request):
-
- if request.path.endswith(('.png', '.jpg', '.gif')):
- request.abort()
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- def interceptor(request):
- if request.url == 'https://server.com/some/path':
- request.create_response(
- status_code=200,
- headers={'Content-Type': 'text/html'},
- body='<html>Hello World!</html>'
- )
- driver.request_interceptor = interceptor
- driver.get(...)
-
-
- del driver.request_interceptor
- del driver.response_interceptor
-
11 限制Request
- driver.scopes = [
- '.*stackoverflow.*',
- '.*github.*'
- ]
- driver.get(...)
-
-
- options = {
- 'disable_capture': True
- }
- driver = webdriver.Chrome(seleniumwire_options=options)
-
-
- options = {
- 'exclude_hosts': ['host1.com', 'host2.com']
- }
- driver = webdriver.Chrome(seleniumwire_options=options)
-
-
- def interceptor(request):
-
- if request.path.endswith(('.png', '.jpg', '.gif')):
- request.abort()
- driver.request_interceptor = interceptor
- driver.get(...)
-
12 代理
- options = {
- 'proxy': {
- 'http': 'http://192.168.10.100:8888',
- 'https': 'https://192.168.10.100:8888',
- 'no_proxy': 'localhost,127.0.0.1'
- }
- }
- driver = webdriver.Chrome(seleniumwire_options=options)
-
-
- options = {
- 'proxy': {
- 'https': 'https://user:pass@192.168.10.100:8888',
- }
- }
-
-
- options = {
- 'proxy': {
- 'https': 'https://192.168.10.100:8888',
- 'custom_authorization': 'Bearer mytoken123'
- }
- }
-
-
- $ export HTTP_PROXY="http://192.168.10.100:8888"
- $ export HTTPS_PROXY="https://192.168.10.100:8888"
- $ export NO_PROXY="localhost,127.0.0.1"
-
-
- options = {
- 'proxy': {
- 'http': 'socks5://user:pass@192.168.10.100:8888',
- 'https': 'socks5://user:pass@192.168.10.100:8888',
- 'no_proxy': 'localhost,127.0.0.1'
- }
- }
- driver = webdriver.Chrome(seleniumwire_options=options)
-
-
- driver.get(...)
-
- driver.proxy = {
- 'https': 'https://user:pass@192.168.10.100:8888',
- }
- driver.get(...)
-
13 机器人检测
- pip install undetected-chromedriver
-
- import seleniumwire.undetected_chromedriver as uc
- chrome_options = uc.ChromeOptions()
- driver = uc.Chrome(
- options=chrome_options,
- seleniumwire_options={}
- )
-