| /*************************************************************************** |
| * _ _ ____ _ |
| * Project ___| | | | _ \| | |
| * / __| | | | |_) | | |
| * | (__| |_| | _ <| |___ |
| * \___|\___/|_| \_\_____| |
| * |
| * Web crawler based on curl and libxml2. |
| * Copyright (C) 2018 Jeroen Ooms <[email protected]> |
| * License: MIT |
| * |
| * To compile: |
| * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) |
| * |
| */ |
| /* <DESC> |
| * Web crawler based on curl and libxml2 to stress-test curl with |
| * hundreds of concurrent connections to various servers. |
| * </DESC> |
| */ |
| |
| /* Parameters */ |
| int max_con = 200; |
| int max_total = 20000; |
| int max_requests = 500; |
| int max_link_per_page = 5; |
| int follow_relative_links = 0; |
| char *start_page = "https://www.reuters.com"; |
| |
| #include <libxml/HTMLparser.h> |
| #include <libxml/xpath.h> |
| #include <libxml/uri.h> |
| #include <curl/curl.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <signal.h> |
| |
| int pending_interrupt = 0; |
| void sighandler(int dummy) |
| { |
| pending_interrupt = 1; |
| } |
| |
| /* resizable buffer */ |
| typedef struct { |
| char *buf; |
| size_t size; |
| } memory; |
| |
| size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx) |
| { |
| size_t realsize = sz * nmemb; |
| memory *mem = (memory*) ctx; |
| char *ptr = realloc(mem->buf, mem->size + realsize); |
| if(!ptr) { |
| /* out of memory */ |
| printf("not enough memory (realloc returned NULL)\n"); |
| return 0; |
| } |
| mem->buf = ptr; |
| memcpy(&(mem->buf[mem->size]), contents, realsize); |
| mem->size += realsize; |
| return realsize; |
| } |
| |
| CURL *make_handle(char *url) |
| { |
| CURL *handle = curl_easy_init(); |
| |
| /* Important: use HTTP2 over HTTPS */ |
| curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); |
| curl_easy_setopt(handle, CURLOPT_URL, url); |
| |
| /* buffer body */ |
| memory *mem = malloc(sizeof(memory)); |
| mem->size = 0; |
| mem->buf = malloc(1); |
| curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); |
| curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); |
| curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); |
| |
| /* For completeness */ |
| curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, ""); |
| curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); |
| curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); |
| curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); |
| curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L); |
| curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); |
| curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); |
| curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); |
| curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); |
| curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); |
| curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); |
| curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); |
| return handle; |
| } |
| |
| /* HREF finder implemented in libxml2 but could be any HTML parser */ |
| size_t follow_links(CURLM *multi_handle, memory *mem, char *url) |
| { |
| int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ |
| HTML_PARSE_NOWARNING | HTML_PARSE_NONET; |
| htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); |
| if(!doc) |
| return 0; |
| xmlChar *xpath = (xmlChar*) "//a/@href"; |
| xmlXPathContextPtr context = xmlXPathNewContext(doc); |
| xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); |
| xmlXPathFreeContext(context); |
| if(!result) |
| return 0; |
| xmlNodeSetPtr nodeset = result->nodesetval; |
| if(xmlXPathNodeSetIsEmpty(nodeset)) { |
| xmlXPathFreeObject(result); |
| return 0; |
| } |
| size_t count = 0; |
| for(int i = 0; i < nodeset->nodeNr; i++) { |
| double r = rand(); |
| int x = r * nodeset->nodeNr / RAND_MAX; |
| const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; |
| xmlChar *href = xmlNodeListGetString(doc, node, 1); |
| if(follow_relative_links) { |
| xmlChar *orig = href; |
| href = xmlBuildURI(href, (xmlChar *) url); |
| xmlFree(orig); |
| } |
| char *link = (char *) href; |
| if(!link || strlen(link) < 20) |
| continue; |
| if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { |
| curl_multi_add_handle(multi_handle, make_handle(link)); |
| if(count++ == max_link_per_page) |
| break; |
| } |
| xmlFree(link); |
| } |
| xmlXPathFreeObject(result); |
| return count; |
| } |
| |
| int is_html(char *ctype) |
| { |
| return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html"); |
| } |
| |
| int main(void) |
| { |
| signal(SIGINT, sighandler); |
| LIBXML_TEST_VERSION; |
| curl_global_init(CURL_GLOBAL_DEFAULT); |
| CURLM *multi_handle = curl_multi_init(); |
| curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); |
| curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); |
| |
| /* enables http/2 if available */ |
| #ifdef CURLPIPE_MULTIPLEX |
| curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); |
| #endif |
| |
| /* sets html start page */ |
| curl_multi_add_handle(multi_handle, make_handle(start_page)); |
| |
| int msgs_left; |
| int pending = 0; |
| int complete = 0; |
| int still_running = 1; |
| while(still_running && !pending_interrupt) { |
| int numfds; |
| curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); |
| curl_multi_perform(multi_handle, &still_running); |
| |
| /* See how the transfers went */ |
| CURLMsg *m = NULL; |
| while((m = curl_multi_info_read(multi_handle, &msgs_left))) { |
| if(m->msg == CURLMSG_DONE) { |
| CURL *handle = m->easy_handle; |
| char *url; |
| memory *mem; |
| curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); |
| curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); |
| if(m->data.result == CURLE_OK) { |
| long res_status; |
| curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); |
| if(res_status == 200) { |
| char *ctype; |
| curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); |
| printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); |
| if(is_html(ctype) && mem->size > 100) { |
| if(pending < max_requests && (complete + pending) < max_total) { |
| pending += follow_links(multi_handle, mem, url); |
| still_running = 1; |
| } |
| } |
| } |
| else { |
| printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); |
| } |
| } |
| else { |
| printf("[%d] Connection failure: %s\n", complete, url); |
| } |
| curl_multi_remove_handle(multi_handle, handle); |
| curl_easy_cleanup(handle); |
| free(mem->buf); |
| free(mem); |
| complete++; |
| pending--; |
| } |
| } |
| } |
| curl_multi_cleanup(multi_handle); |
| curl_global_cleanup(); |
| return 0; |
| } |