Multithreaded HTTP Crawler
Python
Using requests
from concurrent.futures import ThreadPoolExecutor
import requests
URLS = (
    "https://httpbin.org/html",
    "https://httpbin.org/links/10/0",
    "https://httpbin.org/robots.txt",
    "https://httpbin.org/user-agent",
    "https://httpbin.org/links/10/0",
    "https://httpbin.org/robots.txt",
    "https://httpbin.org/xml",
    "https://httpbin.org/redirect/1",
    "https://httpbin.org/redirect/2",
    "https://httpbin.org/cookies",
    "https://httpbin.org/basic-auth/user/passwd",
    "https://httpbin.org/gzip",
)
def crawl_worker(url):
    # Fetch the URL and report its HTTP status code.
    try:
        print(f"Response of url: {url} is {requests.get(url).status_code}")
    except requests.RequestException:
        print(f"Failed to get url: {url}")

if __name__ == "__main__":
    # map() hands each URL to a thread in the pool.
    with ThreadPoolExecutor() as executor:
        executor.map(crawl_worker, URLS)
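When max_workers is omitted, ThreadPoolExecutor picks its own pool size (min(32, os.cpu_count() + 4) on Python 3.8+). A minimal variation, assuming the same URLS tuple from above (fetch_status is just an illustrative name), that caps the pool and collects status codes in input order instead of printing inside the worker:

def fetch_status(url):
    # Return the status code, or None if the request fails.
    try:
        return requests.get(url, timeout=10).status_code
    except requests.RequestException:
        return None

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as executor:
        # map() yields results in the same order as URLS.
        for url, status in zip(URLS, executor.map(fetch_status, URLS)):
            print(url, status)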
Rust
Using reqwest
use std::thread;

fn crawl_worker(url: &str) {
    // Parse first so a malformed URL fails with a distinct message.
    let parsed_url = reqwest::Url::parse(url).expect("Bad url format.");
    // Synchronous request; needs reqwest's "blocking" feature in Cargo.toml.
    let response = reqwest::blocking::get(parsed_url).expect("Failed to get url.");
    println!("Response of url: {} is {}", url, response.status());
}

fn main() {
    let urls = vec![
        "https://httpbin.org/html",
        "https://httpbin.org/links/10/0",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/user-agent",
        "https://httpbin.org/links/10/0",
        "https://httpbin.org/robots.txt",
        "https://httpbin.org/xml",
        "https://httpbin.org/redirect/1",
        "https://httpbin.org/redirect/2",
        "https://httpbin.org/cookies",
        "https://httpbin.org/basic-auth/user/passwd",
        "https://httpbin.org/gzip",
    ];

    // Spawn one thread per URL, keeping the handles so they can be joined.
    let mut queue = vec![];
    for url in urls {
        queue.push(thread::spawn(move || {
            crawl_worker(url);
        }));
    }

    // Wait for every crawl to finish before exiting.
    for job in queue {
        let _ = job.join();
    }
}
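Note that this spawns one OS thread per URL, unlike ThreadPoolExecutor, which reuses a fixed pool. A bounded sketch using the third-party threadpool crate (an assumption; it would need threadpool added to Cargo.toml), reusing crawl_worker from above:

use threadpool::ThreadPool;

fn crawl_all(urls: Vec<&'static str>) {
    // Cap concurrency at four worker threads, mirroring a fixed pool.
    let pool = ThreadPool::new(4);
    for url in urls {
        pool.execute(move || crawl_worker(url));
    }
    // Block until every queued job has run.
    pool.join();
}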