Hybrid proxy middleware
Configures the downloader to use the appropriate proxy based on the domain of each request:
- tor proxy for .onion domains
- gluetun VPN proxy for clearnet domains
Manages the Tor circuit through the control port, enabling IP rotation as needed.
ProxyMiddleware(settings)
Bases: HttpProxyMiddleware
Source code in src/onion_peeler/middlewares/proxy.py
def __init__(self, settings):
    """Read proxy configuration and connect to the Tor control port.

    Configuration comes from the optional ``ONION_PEELER_PROXY`` setting,
    which is expected to be a dict; anything else is treated as empty so
    every option falls back to its default.

    Recognised keys (defaults in parentheses):
        tor_host ("tor"), tor_port (9080),
        tor_control_host (same as tor_host), tor_control_port (9051),
        tor_control_password ("" -> ``TOR_CONTROL_PASSWORD`` env var),
        tor_rotation_interval (10),
        vpn_host ("gluetun"), vpn_port (8888),
        tor_proxy_url, vpn_proxy_url (built from host/port when absent).

    Args:
        settings: Scrapy settings object for the running crawler.

    Raises:
        ValueError: if a port or the rotation interval is not int-convertible.
    """
    # The base class takes the proxy *auth encoding* as its only argument,
    # not the settings object; passing settings would leave a non-string
    # in ``self.auth_encoding``.
    super().__init__(settings.get("HTTPPROXY_AUTH_ENCODING", "latin-1"))

    proxy_setting_key = "ONION_PEELER_PROXY"
    # ``settings.get`` already unwraps the stored SettingsAttribute value.
    proxy_cfg = settings.get(proxy_setting_key)
    if not isinstance(proxy_cfg, dict):
        proxy_cfg = {}

    # Tor proxy endpoint and control-port endpoint.
    self.tor_host = proxy_cfg.get("tor_host", "tor")
    self.tor_port = int(proxy_cfg.get("tor_port", 9080))
    self.tor_control_host = proxy_cfg.get("tor_control_host", self.tor_host)
    self.tor_control_port = int(proxy_cfg.get("tor_control_port", 9051))

    # An empty/missing configured password falls back to the environment.
    self.tor_password = (
        proxy_cfg.get("tor_control_password", "")
        or os.getenv("TOR_CONTROL_PASSWORD", "")
    )
    # NOTE(review): the unit of this interval (requests vs. seconds) is
    # decided by the rotation logic, which is not visible here.
    self.tor_rotation_interval = int(proxy_cfg.get("tor_rotation_interval", 10))

    vpn_host = proxy_cfg.get("vpn_host", "gluetun")
    vpn_port = int(proxy_cfg.get("vpn_port", 8888))

    # Explicit proxy URLs win over the host/port pairs.
    self.tor_proxy = proxy_cfg.get(
        "tor_proxy_url", f"http://{self.tor_host}:{self.tor_port}"
    )
    self.vpn_proxy = proxy_cfg.get(
        "vpn_proxy_url", f"http://{vpn_host}:{vpn_port}"
    )

    self._tor_controller = None
    self._connect_tor_controller()
|
process_request(request, spider)
Route the request through the appropriate proxy.
Source code in src/onion_peeler/middlewares/proxy.py
def process_request(self, request: Request, spider):
    """Pick the outgoing proxy for ``request`` from its hostname.

    Hosts ending in ``.onion`` or ``.exit``, and ``check.torproject.org``,
    are sent through the Tor proxy; every other host goes through the VPN
    proxy. The choice is written to ``request.meta["proxy"]``.

    Before routing, the Tor circuit is rotated when
    ``self._should_rotate_tor`` says so.

    Returns:
        None.
    """
    if self._should_rotate_tor(request, spider):
        self._rotate_circuit()

    hostname = urlparse(request.url).hostname
    target = hostname.lower() if hostname else ""
    via_tor = target == "check.torproject.org" or target.endswith(
        (".onion", ".exit")
    )

    if not via_tor:
        request.meta["proxy"] = self.vpn_proxy
        return

    request.meta["proxy"] = self.tor_proxy
    # Class-wide count of Tor-routed requests, updated under the lock.
    with self._lock:
        ProxyMiddleware._tor_req_count += 1
|