Scrapy Custom Middleware for Intelligent Proxy Rotation
Scrapy's built-in proxy handling is a single-line request.meta["proxy"] = "..." hook. That's fine for a fixed proxy, but it doesn't give you session pooling, failure tracking, automatic blacklisting, or retry with a fresh identity. For those, you write custom downloader middleware.
This guide builds two middlewares: HexProxyMiddleware (assigns sessions and tracks failures) and SmartRetryMiddleware (retries failed requests on a different session). They slot into Scrapy's downloader pipeline and integrate with its stats collector so you see failure rates in the crawl summary.
settings.py wiring
# settings.py excerpt
DOWNLOADER_MIDDLEWARES = {
    # Disabled: the stock retry middleware knows nothing about sessions and
    # would resend the request through the same failed session.
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
    # Scrapy runs process_response / process_exception from the HIGHEST
    # priority number down and stops the chain as soon as one of them returns
    # a new Request. HexProxyMiddleware must therefore have a higher number
    # than SmartRetryMiddleware: it records the failure against the session
    # first, then the retry middleware swaps the response for a retry.
    # With the numbers the other way round, retried failures are never
    # counted and sessions are (almost) never blacklisted.
    "scraper.middlewares.SmartRetryMiddleware": 550,
    "scraper.middlewares.HexProxyMiddleware": 560,
}

HEX_PROXY_GATEWAY = "http://gate.hexproxies.com:7777"
HEX_PROXY_USER = "USER"
HEX_PROXY_PASS = "PASS"
HEX_PROXY_BLACKLIST_COOLDOWN = 300  # seconds
HEX_PROXY_MAX_FAILURES = 5
HEX_PROXY_SESSION_POOL_SIZE = 20
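Priority ordering matters. Lower number = earlier in the request pipeline, later in the response pipeline: process_request hooks run in ascending priority order, while process_response and process_exception hooks run in descending order, and Scrapy stops the chain as soon as one of them returns a new Request. HexProxyMiddleware has to see a failed response or exception before SmartRetryMiddleware swaps it for a retry, so HexProxyMiddleware needs the higher priority number of the two. With that ordering the proxy middleware records the failure against the session, the retry middleware then drops the session from the request and reschedules it, and the rescheduled request passes through HexProxyMiddleware.process_request again to receive a fresh session. If the numbers are the other way round, failures are only counted once retries are exhausted, and blacklisting almost never triggers.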
We also disable Scrapy's default RetryMiddleware because it doesn't know about session pooling — it would retry on the same failed IP.
Imports and session type
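Twisted exceptions are the bane of Scrapy beginners. Catching plain Exception is a bad habit — you want to retry on transport errors but not on programming errors. The RETRYABLE_EXCEPTIONS tuple lists the specific transport errors, mostly Twisted's, that indicate a proxy-side or network issue. The one broad entry is the built-in IOError (an alias of OSError in Python 3); Scrapy's stock retry middleware treats it as retryable too, but keep in mind that it matches more than pure network failures.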
# scraper/middlewares.py
import base64
import logging
import random
import time
from dataclasses import dataclass, field
from typing import Optional

from scrapy import Request, Spider
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Response
from scrapy.utils.response import response_status_message
from twisted.internet.defer import TimeoutError as TwistedTimeoutError
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    TCPTimedOutError,
    TimeoutError as TxTimeoutError,
)
from twisted.web._newclient import ResponseNeverReceived
logger = logging.getLogger(__name__)
# Exceptions that point at a proxy-side or network problem and are therefore
# worth retrying on a different session. Programming errors are deliberately
# not listed so that they surface instead of being retried.
RETRYABLE_EXCEPTIONS = (
    ConnectError,  # generic connect failure (parent of several entries below)
    ConnectionRefusedError,
    ConnectionDone,
    ConnectionLost,
    TCPTimedOutError,
    TxTimeoutError,
    TwistedTimeoutError,
    ResponseNeverReceived,
    TunnelError,  # CONNECT through the proxy failed (HTTPS targets)
    IOError,  # broad on purpose; Scrapy's stock retry list includes it too
)
@dataclass
class SessionEntry:
    """Mutable bookkeeping record for one sticky proxy session.

    ``failures`` is the running failure count and ``blacklisted_until`` is
    the clock reading before which the session must not be handed out.
    Both default to zero.
    """

    session_id: str
    failures: int = 0
    blacklisted_until: float = 0.0

    def available(self, now: float) -> bool:
        """Return True once *now* has reached ``blacklisted_until``."""
        return self.blacklisted_until <= now
The proxy middleware
class HexProxyMiddleware:
    """Assign a sticky Hex Proxies gateway session to every request.

    Each outgoing request gets a session from a bounded pool. Sessions
    accumulate *consecutive* failures (a good response resets the counter);
    once a session reaches HEX_PROXY_MAX_FAILURES it is blacklisted for
    HEX_PROXY_BLACKLIST_COOLDOWN seconds and its counter starts over.

    Ordering note: Scrapy stops the response/exception chain as soon as a
    middleware returns a new Request. This middleware therefore only sees
    the failures it is meant to count when its ``process_response`` and
    ``process_exception`` run before the retry middleware's, i.e. when it is
    registered with a higher DOWNLOADER_MIDDLEWARES number than the retry
    middleware.
    """

    # Non-5xx statuses counted against a session; every 5xx counts as well.
    FAILURE_STATUSES = frozenset({403, 407, 408, 429})

    def __init__(self, gateway: str, user: str, password: str,
                 pool_size: int, max_failures: int, cooldown: int):
        """Store the gateway credentials and build the session pool.

        Args:
            gateway: Proxy gateway URL written to ``request.meta["proxy"]``.
            user: Base proxy username; the session suffix is appended to it.
            password: Proxy password.
            pool_size: Number of session entries to create.
            max_failures: Failure count at which a session is blacklisted.
            cooldown: Blacklist duration in seconds.
        """
        self.gateway = gateway
        self.user = user
        self.password = password
        self.max_failures = max_failures
        self.cooldown = cooldown
        self.sessions = [
            SessionEntry(session_id=f"s{i:03d}") for i in range(pool_size)
        ]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Raises:
            NotConfigured: if the gateway or credentials are missing, or the
                pool size is not positive (an empty pool could never serve a
                request).
        """
        s = crawler.settings
        for key in ("HEX_PROXY_GATEWAY", "HEX_PROXY_USER", "HEX_PROXY_PASS"):
            if not s.get(key):
                raise NotConfigured(f"{key} not set")
        pool_size = s.getint("HEX_PROXY_SESSION_POOL_SIZE", 20)
        if pool_size < 1:
            raise NotConfigured("HEX_PROXY_SESSION_POOL_SIZE must be >= 1")
        return cls(
            gateway=s.get("HEX_PROXY_GATEWAY"),
            user=s.get("HEX_PROXY_USER"),
            password=s.get("HEX_PROXY_PASS"),
            pool_size=pool_size,
            max_failures=s.getint("HEX_PROXY_MAX_FAILURES", 5),
            cooldown=s.getint("HEX_PROXY_BLACKLIST_COOLDOWN", 300),
        )

    def _pick_session(self) -> Optional[SessionEntry]:
        """Return a random non-blacklisted session, or None if there is none."""
        now = time.monotonic()
        live = [s for s in self.sessions if s.available(now)]
        if not live:
            return None
        return random.choice(live)

    def process_request(self, request: Request, spider: Spider):
        """Attach a session and the matching proxy credentials to *request*.

        A session already stored in ``request.meta["hex_session"]`` (e.g. on
        a request derived from an earlier one) is kept only while it is still
        usable; a session blacklisted in the meantime is replaced.

        Raises:
            IgnoreRequest: if every session in the pool is blacklisted.
        """
        session: Optional[SessionEntry] = request.meta.get("hex_session")
        if session is None or not session.available(time.monotonic()):
            session = self._pick_session()
            if session is None:
                spider.crawler.stats.inc_value("hex_proxy/ignored")
                raise IgnoreRequest("all proxy sessions blacklisted")
            request.meta["hex_session"] = session
        user = f"{self.user}-session-{session.session_id}"
        auth = base64.b64encode(f"{user}:{self.password}".encode()).decode()
        request.meta["proxy"] = self.gateway
        # Always overwrite: a copied request still carries the header of the
        # session it was originally sent with.
        request.headers[b"Proxy-Authorization"] = f"Basic {auth}".encode()
        return None

    def process_response(self, request: Request, response: Response, spider: Spider):
        """Update the session's failure counter from the response status.

        The response itself is always passed on unchanged.
        """
        session: Optional[SessionEntry] = request.meta.get("hex_session")
        if session is None:
            return response
        if response.status in self.FAILURE_STATUSES or response.status >= 500:
            self._record_failure(session, spider, reason=str(response.status))
        else:
            # Any other status counts as success and clears the streak.
            session.failures = 0
        return response

    def process_exception(self, request: Request, exception, spider: Spider):
        """Count a retryable transport error against the request's session.

        Always returns None so that the retry middleware decides what to do
        with the request.
        """
        if not isinstance(exception, RETRYABLE_EXCEPTIONS):
            return None
        session: Optional[SessionEntry] = request.meta.get("hex_session")
        if session is not None:
            self._record_failure(session, spider, reason=type(exception).__name__)
        return None  # let the retry middleware handle retry

    def _record_failure(self, session: SessionEntry, spider: Spider, reason: str):
        """Bump the failure counter and blacklist the session at the limit."""
        session.failures += 1
        spider.crawler.stats.inc_value(f"hex_proxy/failure/{reason}")
        if session.failures >= self.max_failures:
            session.blacklisted_until = time.monotonic() + self.cooldown
            # Start from a clean slate once the cooldown has elapsed.
            session.failures = 0
            spider.crawler.stats.inc_value("hex_proxy/blacklisted")
            logger.warning(
                "blacklisted session %s for %ds (reason=%s)",
                session.session_id, self.cooldown, reason,
            )
The session ID scheme uses Hex's sticky session convention: USER-session-{id}. Each session ID maps to a sticky upstream IP for up to 30 minutes. With a pool of 20 sessions, you can parallelize 20 logical identities without burning through the full residential pool. When a session accumulates 5 consecutive failures (any successful response resets its counter), it's put in cooldown for 5 minutes — by the time the cooldown expires, Hex has rotated the underlying IP anyway.
Smart retry
The retry middleware is small but load-bearing. The key line is new_request.meta.pop("hex_session", None) — by removing the session from the retry request, we force HexProxyMiddleware to pick a fresh session on the retry. Without this, retries would hammer the same failed IP.
class SmartRetryMiddleware:
    """Retry middleware that forces a *different* session on every retry.

    Replaces Scrapy's built-in RetryMiddleware with session-aware logic:
    when a request fails, the failed session is dropped from the copy's
    ``meta`` so that the proxy middleware assigns a fresh one when the copy
    is sent. Requests carrying ``meta["dont_retry"]`` are never retried.
    """

    MAX_RETRIES = 4
    RETRY_STATUSES = {403, 407, 408, 429, 500, 502, 503, 504}

    def process_response(self, request: Request, response: Response, spider: Spider):
        """Return a retry request for retryable statuses, else the response.

        When the retry budget is used up the bad response is returned as-is.
        """
        if request.meta.get("dont_retry", False):
            return response
        if response.status not in self.RETRY_STATUSES:
            return response
        retry = self._retry(request, spider, reason=f"status {response.status}")
        return response if retry is None else retry

    def process_exception(self, request: Request, exception, spider: Spider):
        """Return a retry request for retryable exceptions, else None."""
        if request.meta.get("dont_retry", False):
            return None
        if not isinstance(exception, RETRYABLE_EXCEPTIONS):
            return None
        return self._retry(request, spider, reason=type(exception).__name__)

    def _retry(self, request: Request, spider: Spider, reason: str):
        """Build the next attempt for *request*, or None if out of retries."""
        retries = request.meta.get("retry_times", 0) + 1
        if retries > self.MAX_RETRIES:
            spider.crawler.stats.inc_value("hex_proxy/retry/max_exceeded")
            return None
        new_request = request.copy()
        new_request.meta["retry_times"] = retries
        new_request.meta.pop("hex_session", None)  # force new session pick
        # The URL was already seen by the dupefilter; let the retry through.
        new_request.dont_filter = True
        spider.crawler.stats.inc_value("hex_proxy/retry")
        spider.logger.debug(
            "retrying %s (attempt %d) after %s",
            request.url, retries, reason,
        )
        return new_request
Stats integration
Scrapy's stats collector is the right place to surface proxy metrics. At the end of a crawl you'll see lines like hex_proxy/failure/429: 42, hex_proxy/blacklisted: 3, hex_proxy/retry: 187. These are invaluable for tuning HEX_PROXY_MAX_FAILURES and HEX_PROXY_BLACKLIST_COOLDOWN. If you see a lot of blacklists, your pool is too small or your max_failures is too low.
What to tune
Pool size: 20 is a good default for residential rotating. Larger pools spread failures thinner but cost more gateway connections. Max failures: 5 is conservative; 3 is aggressive. On hard targets, 3 gets you fresher IPs faster; on easy targets, 5 avoids over-rotation. Cooldown: should be longer than the target's per-IP block window — 5 minutes is fine for most sites, 15+ for aggressive targets.
For the architecture around Scrapy, see our distributed scraping pipeline guide and how proxy rotation works.