Distributed Systems Patterns: Circuit Breaker and Retry Logic
Distributed systems fail. After building resilient microservices, I’ve learned that proper failure handling is critical. Here are the patterns that work in production.
The Problem
In distributed systems:
- Networks fail
- Services crash
- Timeouts occur
- Cascading failures happen
Without proper patterns, one failure can bring down the entire system.
Circuit Breaker Pattern
Concept
A circuit breaker is always in one of three states:
- Open: Fails fast, doesn’t call service
- Closed: Normal operation
- Half-Open: Testing if service recovered
Implementation
/**
 * Circuit breaker around an async `service` function.
 *
 * States:
 * - CLOSED:    calls pass through; failures since the last success are counted.
 * - OPEN:      calls are rejected immediately with 'Circuit breaker is OPEN'
 *              until `resetTimeout` ms have passed since the breaker opened.
 * - HALF_OPEN: exactly one trial call is in flight. Its success closes the
 *              breaker, its failure re-opens it. Other calls made while the
 *              trial is pending are rejected so a recovering service is not
 *              flooded.
 */
class CircuitBreaker {
  /**
   * @param {(...args: any[]) => Promise<any>} service - Function to protect.
   * @param {object} [options]
   * @param {number} [options.failureThreshold=5] - Failures that open the circuit.
   * @param {number} [options.timeout=60000] - Stored on the instance; not read by this class.
   * @param {number} [options.resetTimeout=30000] - Milliseconds to stay OPEN before a trial call.
   */
  constructor(service, options = {}) {
    this.service = service;
    // `??` rather than `||` so an explicit 0 (e.g. resetTimeout: 0) is honoured.
    this.failureThreshold = options.failureThreshold ?? 5;
    this.timeout = options.timeout ?? 60000; // 60 seconds
    this.resetTimeout = options.resetTimeout ?? 30000; // 30 seconds
    this.state = 'CLOSED';
    this.failureCount = 0;
    this.nextAttempt = Date.now();
  }

  /**
   * Invokes the wrapped service with `args` unless the circuit forbids it.
   *
   * @param {...any} args - Forwarded to the service unchanged.
   * @returns {Promise<any>} Whatever the service resolves with.
   * @throws {Error} 'Circuit breaker is OPEN' when the call is rejected
   *   without reaching the service; otherwise the service's own error.
   */
  async call(...args) {
    if (this.state === 'HALF_OPEN') {
      // A trial call is already in flight; keep failing fast until it settles.
      throw new Error('Circuit breaker is OPEN');
    }
    if (this.state === 'OPEN') {
      if (Date.now() < this.nextAttempt) {
        throw new Error('Circuit breaker is OPEN');
      }
      // Reset window elapsed: this call becomes the single trial call.
      this.state = 'HALF_OPEN';
    }
    try {
      const result = await this.service(...args);
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** Records a successful call: clears the failure count and closes the circuit. */
  onSuccess() {
    this.failureCount = 0;
    this.state = 'CLOSED';
  }

  /**
   * Records a failed call. Opens the circuit when the threshold is reached,
   * or immediately when the failed call was the HALF_OPEN trial.
   */
  onFailure() {
    this.failureCount++;
    if (this.state === 'HALF_OPEN' || this.failureCount >= this.failureThreshold) {
      this.state = 'OPEN';
      this.nextAttempt = Date.now() + this.resetTimeout;
    }
  }

  /**
   * @returns {{state: string, failureCount: number, nextAttempt: number}}
   *   Snapshot of the breaker; `nextAttempt` is an epoch-millisecond timestamp.
   */
  getState() {
    return {
      state: this.state,
      failureCount: this.failureCount,
      nextAttempt: this.nextAttempt,
    };
  }
}
Usage
const axios = require('axios');
/**
 * Fetches one user from the example users API.
 *
 * @param {string|number} userId - Identifier placed in the URL path.
 * @returns {Promise<any>} The `data` property of the axios response.
 *   Rejects with whatever `axios.get` rejects with.
 */
async function fetchUser(userId) {
  // Encode the id so characters such as `/`, `?` or `#` cannot change the
  // request path or smuggle in a query string.
  const url = `https://api.example.com/users/${encodeURIComponent(userId)}`;
  const response = await axios.get(url);
  return response.data;
}
// Wrap fetchUser: the circuit opens once 5 failures have accumulated without
// an intervening success, and stays open for 30 seconds (resetTimeout).
const breaker = new CircuitBreaker(fetchUser, {
failureThreshold: 5,
resetTimeout: 30000
});
// Use circuit breaker
// NOTE: the `await` and `return` below mean this fragment belongs inside an
// async function body; the enclosing function is not shown.
try {
const user = await breaker.call('123');
console.log(user);
} catch (error) {
// A fast-fail rejection is recognised only by this exact message string.
if (error.message === 'Circuit breaker is OPEN') {
// Return cached data or default
// (getCachedUser is not defined in this article; presumably a cache lookup.)
return getCachedUser('123');
}
// Any other error was thrown by fetchUser itself; pass it to the caller.
throw error;
}
Retry Logic
Exponential Backoff
/**
 * Calls `fn` until it succeeds, waiting an exponentially growing delay
 * between attempts. Relies on a `sleep(ms)` helper being in scope.
 *
 * @param {() => Promise<any>} fn - Operation to attempt; called with no arguments.
 * @param {object} [options]
 * @param {number} [options.maxRetries=3] - Retries after the first attempt (0 = try once).
 * @param {number} [options.initialDelay=1000] - Delay in ms before the first retry.
 * @param {number} [options.maxDelay=30000] - Upper bound in ms for any single delay.
 * @param {number} [options.multiplier=2] - Growth factor applied per attempt.
 * @returns {Promise<any>} The first successful result of `fn`.
 * @throws The error from `fn`: immediately for status 400/401, otherwise the
 *   last error once all attempts are used up.
 */
async function retryWithBackoff(fn, options = {}) {
  // `??` rather than `||`: an explicit 0 (no retries, no delay) must not be
  // replaced by the default.
  const maxRetries = options.maxRetries ?? 3;
  const initialDelay = options.initialDelay ?? 1000;
  const maxDelay = options.maxDelay ?? 30000;
  const multiplier = options.multiplier ?? 2;
  let lastError;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      // Don't retry on certain errors: a bad request or missing credentials
      // will fail the same way every time. `?.` tolerates a nullish rejection.
      if (error?.status === 400 || error?.status === 401) {
        throw error;
      }
      if (attempt < maxRetries) {
        const delay = Math.min(
          initialDelay * Math.pow(multiplier, attempt),
          maxDelay
        );
        await sleep(delay);
      }
    }
  }
  throw lastError;
}
/**
 * Pauses for `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
Jitter
/**
 * Lengthens `delay` by a random amount so that many clients retrying at once
 * do not all fire at the same instant.
 *
 * @param {number} delay - Base delay.
 * @param {number} [jitter=0.1] - Largest extra fraction of `delay` to add.
 * @returns {number} A value from `delay` up to (but not including)
 *   `delay + delay * jitter`, for non-negative inputs.
 */
function addJitter(delay, jitter = 0.1) {
  return delay + delay * jitter * Math.random();
}
/**
 * Calls `fn` until it succeeds, doubling the delay between attempts and
 * adding random jitter to each delay. Relies on `addJitter(delay)` and
 * `sleep(ms)` helpers being in scope.
 *
 * @param {() => Promise<any>} fn - Operation to attempt; called with no arguments.
 * @param {object} [options]
 * @param {number} [options.maxRetries=3] - Retries after the first attempt (0 = try once).
 * @param {number} [options.baseDelay=1000] - Delay in ms before the first retry, pre-jitter.
 * @returns {Promise<any>} The first successful result of `fn`.
 * @throws The error from the final attempt once all attempts are used up.
 */
async function retryWithJitter(fn, options = {}) {
  // `??` rather than `||`: `maxRetries: 0` and `baseDelay: 0` are valid
  // requests and must not fall back to the defaults.
  const maxRetries = options.maxRetries ?? 3;
  const baseDelay = options.baseDelay ?? 1000;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      if (attempt < maxRetries) {
        const delay = addJitter(baseDelay * Math.pow(2, attempt));
        await sleep(delay);
      } else {
        // Out of attempts: surface the most recent failure.
        throw error;
      }
    }
  }
}
Timeout Pattern
/**
 * Races `promise` against a timer.
 *
 * The underlying operation is NOT cancelled when the timer wins; only the
 * returned promise is rejected.
 *
 * @param {Promise<any>} promise - Operation to bound.
 * @param {number} timeoutMs - Milliseconds to wait before giving up.
 * @returns {Promise<any>} Settles like `promise` if it settles first;
 *   otherwise rejects with `Error('Operation timed out')`.
 */
function withTimeout(promise, timeoutMs) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error('Operation timed out'));
    }, timeoutMs);
  });
  // Always disarm the timer once the race is decided. Otherwise every call
  // leaves a live timer behind for `timeoutMs`, which keeps the Node.js event
  // loop alive and accumulates under load.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
// Usage
// Top-level `await`: run this in an ES module or inside an async function.
try {
  const result = await withTimeout(
    fetchUser('123'),
    5000 // 5 second timeout
  );
} catch (error) {
  if (error.message !== 'Operation timed out') {
    // Not a timeout: fetchUser itself failed. Rethrow rather than silently
    // swallowing the error.
    throw error;
  }
  // Handle timeout
}
Bulkhead Pattern
Isolate resources to prevent cascading failures:
/**
 * Limits how many tasks run at once. Tasks submitted while the limit is
 * reached wait in a FIFO queue and start as running tasks finish.
 */
class Bulkhead {
  /**
   * @param {number} maxConcurrency - Maximum number of tasks allowed to run at once.
   */
  constructor(maxConcurrency) {
    this.maxConcurrency = maxConcurrency;
    this.active = 0; // tasks currently running
    this.queue = []; // waiting { fn, resolve, reject } entries, oldest first
  }

  /**
   * Runs `fn` now if a slot is free, otherwise queues it.
   *
   * @param {() => any} fn - Task to run; may return a promise.
   * @returns {Promise<any>} Settles with the outcome of `fn`.
   */
  async execute(fn) {
    return new Promise((resolve, reject) => {
      const hasFreeSlot = this.active < this.maxConcurrency;
      if (!hasFreeSlot) {
        this.queue.push({ fn, resolve, reject });
        return;
      }
      this.run(fn, resolve, reject);
    });
  }

  /**
   * Occupies a slot for the duration of `fn`, forwards its outcome to the
   * supplied callbacks, then releases the slot and starts the oldest queued
   * task, if any.
   *
   * @param {() => any} fn - Task to run.
   * @param {(value: any) => void} resolve - Called with the task's result.
   * @param {(reason: any) => void} reject - Called with the task's error.
   */
  async run(fn, resolve, reject) {
    this.active += 1;
    try {
      resolve(await fn());
    } catch (failure) {
      reject(failure);
    } finally {
      this.active -= 1;
      if (this.queue.length > 0) {
        const waiting = this.queue.shift();
        this.run(waiting.fn, waiting.resolve, waiting.reject);
      }
    }
  }
}
// Usage
// At most 10 tasks run at once; further execute() calls wait in the
// bulkhead's queue until a running task finishes.
const bulkhead = new Bulkhead(10); // Max 10 concurrent
// Top-level `await`: run this in an ES module or inside an async function.
await bulkhead.execute(() => fetchUser('123'));
Combining Patterns
/**
 * Combines the three patterns around one async `service` function:
 * bulkhead (outermost) -> circuit breaker -> timeout -> service.
 *
 * The timeout sits INSIDE the circuit breaker on purpose. A call that times
 * out then rejects inside the breaker and is counted as a failure, so a
 * service that hangs rather than erroring still opens the circuit.
 */
class ResilientService {
  /**
   * @param {(...args: any[]) => Promise<any>} service - Function to protect.
   * @param {object} [options]
   * @param {number} [options.failureThreshold=5] - Passed to the circuit breaker.
   * @param {number} [options.resetTimeout=30000] - Passed to the circuit breaker (ms).
   * @param {number} [options.maxConcurrency=10] - Passed to the bulkhead.
   * @param {number} [options.timeout=5000] - Per-call time limit in ms.
   */
  constructor(service, options = {}) {
    this.timeout = options.timeout || 5000;
    // Read `this.timeout` at call time so later changes to it take effect.
    const timedService = (...args) => withTimeout(service(...args), this.timeout);
    this.circuitBreaker = new CircuitBreaker(timedService, {
      failureThreshold: options.failureThreshold || 5,
      resetTimeout: options.resetTimeout || 30000
    });
    this.bulkhead = new Bulkhead(options.maxConcurrency || 10);
  }

  /**
   * Calls the protected service.
   *
   * @param {...any} args - Forwarded to the service unchanged.
   * @returns {Promise<any>} The service's result.
   * @throws The service's error, `Error('Operation timed out')`, or the
   *   circuit breaker's rejection when the circuit is open.
   */
  async call(...args) {
    return this.bulkhead.execute(() => this.circuitBreaker.call(...args));
  }
}
// Usage
// Each option is forwarded to the component it configures: failureThreshold
// and resetTimeout to the circuit breaker, maxConcurrency to the bulkhead,
// and timeout to the per-call time limit (milliseconds).
const resilientService = new ResilientService(fetchUser, {
failureThreshold: 5,
resetTimeout: 30000,
maxConcurrency: 10,
timeout: 5000
});
// NOTE: the `await` and `return` below mean this fragment belongs inside an
// async function body; the enclosing function is not shown.
try {
// `user` is scoped to this try block; a real caller would return or use it here.
const user = await resilientService.call('123');
} catch (error) {
// Handle error gracefully
// Every failure (open circuit, timeout, or an error from fetchUser) takes
// this one fallback path. getFallbackUser is not defined in this article.
return getFallbackUser('123');
}
Node.js Libraries
Opossum (Circuit Breaker)
const CircuitBreaker = require('opossum');
// Standalone snippet: `CircuitBreaker` here is the class exported by the
// `opossum` package (required just above), not the hand-written class from
// earlier in this article.
// Option meanings below are summarized from Opossum's documentation; confirm
// against the installed version.
const options = {
// presumably: milliseconds a call may run before it is treated as failed
timeout: 3000,
// presumably: percentage of failing calls at which the circuit opens
errorThresholdPercentage: 50,
// presumably: milliseconds the circuit stays open before a trial call
resetTimeout: 30000
};
const breaker = new CircuitBreaker(fetchUser, options);
// Log each state transition.
breaker.on('open', () => console.log('Circuit breaker opened'));
breaker.on('halfOpen', () => console.log('Circuit breaker half-open'));
breaker.on('close', () => console.log('Circuit breaker closed'));
// Register a fallback. NOTE(review): it always looks up user '123',
// whatever argument was fired; fine for a demo, wrong for general use.
breaker.fallback(() => getCachedUser('123'));
// Top-level `await`: run this in an ES module or inside an async function.
const user = await breaker.fire('123');
Axios Retry
const axios = require('axios');
const axiosRetry = require('axios-retry');
// Dedicated axios instance, so the retry behaviour applies only to requests
// made through `client`.
const client = axios.create();
axiosRetry(client, {
retries: 3,
retryDelay: axiosRetry.exponentialDelay,
// Retry when axios-retry's built-in predicate says so, or when the server
// answered with any 5xx status.
// NOTE(review): the second clause has no HTTP-method check, so it presumably
// also retries non-idempotent requests (e.g. POST) on 5xx; confirm that
// is safe for the endpoints called through this client.
retryCondition: (error) => {
return axiosRetry.isNetworkOrIdempotentRequestError(error) ||
error.response?.status >= 500;
}
});
// Top-level `await`: run this in an ES module or inside an async function.
const response = await client.get('https://api.example.com/users/123');
Monitoring
/**
 * Monitoring sketch for ResilientService: counts calls and failures locally
 * and reports durations and outcomes to an external `metrics` client.
 *
 * `metrics` (exposing `histogram` and `increment`) and `this.execute` are not
 * defined in this snippet and must be provided by the surrounding code.
 */
class ResilientService {
  constructor(service, options = {}) {
    // ... existing code ...
    this.metrics = {
      totalCalls: 0,
      failures: 0,
      timeouts: 0,
      // NOTE(review): nothing in this snippet increments circuitBreakerOpens.
      circuitBreakerOpens: 0
    };
  }

  /**
   * Runs `this.execute(...args)` while recording the call count, duration and outcome.
   *
   * @param {...any} args - Forwarded to `this.execute`.
   * @returns {Promise<any>} The result of `this.execute`.
   * @throws Whatever `this.execute` rejects with, after it has been recorded.
   */
  async call(...args) {
    this.metrics.totalCalls++;
    const startTime = Date.now();
    try {
      const result = await this.execute(...args);
      this.recordSuccess(Date.now() - startTime);
      return result;
    } catch (error) {
      this.recordFailure(error, Date.now() - startTime);
      throw error;
    }
  }

  /**
   * @param {number} duration - Call duration in milliseconds.
   */
  recordSuccess(duration) {
    // Send to metrics system
    metrics.histogram('service.call.duration', duration);
    metrics.increment('service.call.success');
  }

  /**
   * Classifies a failure as a timeout or a generic failure and reports it.
   *
   * @param {any} error - The rejection reason; may not be an Error.
   * @param {number} duration - Call duration in milliseconds.
   */
  recordFailure(error, duration) {
    // Matches "timeout", "timed out" and "TIMEDOUT". The previous
    // `includes('timeout')` check missed the 'Operation timed out' message
    // that withTimeout rejects with, so timeouts were counted as plain failures.
    const message = String(error?.message ?? '');
    if (/time(?:d\s*)?out/i.test(message)) {
      this.metrics.timeouts++;
    } else {
      this.metrics.failures++;
    }
    metrics.increment('service.call.failure');
    metrics.histogram('service.call.duration', duration);
  }
}
Best Practices
- Use circuit breakers - Prevent cascading failures
- Implement retries - With exponential backoff
- Set timeouts - Don’t wait forever
- Use bulkheads - Isolate resources
- Monitor metrics - Track failures and latency
- Have fallbacks - Return cached/default data
- Fail fast - Don’t retry forever
- Test failures - Chaos engineering
Conclusion
Resilience patterns enable:
- Graceful failure handling
- Prevention of cascading failures
- Better user experience
- System stability
Combine circuit breakers, retries, timeouts, and bulkheads for production-ready services. The patterns shown here handle real-world failures.
These resilience patterns for distributed systems were written up in July 2018 and reflect what has worked in production.