/* * Copyright (c) 2018-2024 Free Software Foundation, Inc. * * This file is part of libwget. * * Libwget is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Libwget is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libwget. If not, see . * * * Read URLs from stdin and download into results/domain/. * */ #include #include #include #ifndef _WIN32 # include #endif #include typedef struct { int http_links, https_links, status, redirs, redir_insecure, landed_on_https; char host[256], content_type[128]; } stats_t; #define MAXTHREADS 500 static void *downloader_thread(void *p); static void write_stats(const stats_t *stats) { FILE *fp; if ((fp = fopen("out.csv", "a"))) { fprintf(fp, "%s,%d,%d,%d,%d,%s\n", stats->host, stats->status, stats->redir_insecure, stats->redirs, stats->landed_on_https, stats->content_type); fclose(fp); } } /* * helper function: percent-unescape, convert to utf-8, create URL string using base */ static int _normalize_uri(wget_iri *base, wget_string *url, const char *encoding, wget_buffer *buf) { char *urlpart_encoded; size_t urlpart_encoded_length; int rc; if (url->len == 0 || (url->len >= 1 && *url->p == '#')) // ignore e.g. href='#' return -1; char *urlpart = wget_strmemdup(url->p, url->len); if (!urlpart) return -2; wget_iri_unescape_url_inline(urlpart); rc = wget_memiconv(encoding, urlpart, strlen(urlpart), "utf-8", &urlpart_encoded, &urlpart_encoded_length); wget_xfree(urlpart); if (rc) return -3; rc = !wget_iri_relative_to_abs(base, urlpart_encoded, urlpart_encoded_length, buf); wget_xfree(urlpart_encoded); if (rc) return -4; return 0; } static char *_normalize_location(const char *base, const char *url) { wget_buffer buf; wget_string url_s = { .p = url, .len = strlen(url) }; wget_iri *base_iri = wget_iri_parse(base, "utf-8"); char sbuf[1024], *norm_url = NULL; if (!base_iri) return NULL; wget_buffer_init(&buf, sbuf, sizeof(sbuf)); if (_normalize_uri(base_iri, &url_s, "utf-8", &buf) == 0) { norm_url = wget_strmemdup(buf.data, buf.length); } wget_buffer_deinit(&buf); wget_iri_free(&base_iri); return norm_url; } int main(int argc WGET_GCC_UNUSED, const char *const *argv WGET_GCC_UNUSED) { static wget_thread downloaders[MAXTHREADS]; // set up libwget global configuration wget_global_init( // WGET_DEBUG_STREAM, stderr, WGET_ERROR_STREAM, stdout, WGET_INFO_STREAM, stdout, WGET_DNS_CACHING, 0, 0); #ifndef _WIN32 struct sigaction sig_action = { .sa_handler = SIG_IGN }; sigaction(SIGPIPE, &sig_action, NULL); // this forces socket error return #endif // set global timeouts to 5s wget_tcp_set_timeout(NULL, 3000); wget_tcp_set_connect_timeout(NULL, 3000); // OCSP off wget_ssl_set_config_int(WGET_SSL_OCSP, 0); wget_ssl_set_config_int(WGET_SSL_OCSP_STAPLING, 0); // don't check cert and SNI wget_ssl_set_config_int(WGET_SSL_CHECK_CERTIFICATE, 0); wget_ssl_set_config_int(WGET_SSL_CHECK_HOSTNAME, 0); // start threads for (int rc, it = 0; it < MAXTHREADS; it++) { if ((rc = wget_thread_start(&downloaders[it], downloader_thread, NULL, 0)) != 0) { wget_error_printf("Failed to start thread, error %d\n", rc); } } // wait until threads are done for (int rc, it = 0; it < MAXTHREADS; it++) { if ((rc = wget_thread_join(&downloaders[it])) != 0) wget_error_printf("Failed to wait for downloader #%d (%d %d)\n", it, rc, errno); } // free resources - needed for valgrind testing wget_global_deinit(); return 0; } static void *downloader_thread(WGET_GCC_UNUSED void *p) { stats_t stats; wget_http_response *resp = NULL; char *url = NULL; while (fscanf(stdin, "%255s", stats.host) == 1) { wget_xfree(url); if (!wget_strncasecmp_ascii(stats.host, "http://", 7)) url = wget_strdup(stats.host); else if (!wget_strncasecmp_ascii(stats.host, "https://", 8)) url = wget_strdup(stats.host); else url = wget_aprintf("http://%s", stats.host); stats.http_links = stats. https_links = 0; stats.status = -1; stats.redirs = stats.redir_insecure = stats.landed_on_https = 0; *stats.content_type = 0; // follow up to max 5 redirections, stop if one is plain text for (int redirs = 0, max = 5; redirs < max; redirs++) { wget_http_free_response(&resp); // wget_http_close(&conn); wget_info_printf("%s%s\n", redirs ? " -> " : "", url); // execute an HTTP GET request and return the response resp = wget_http_get( WGET_HTTP_URL, url, WGET_HTTP_HEADER_ADD, "User-Agent", "Mozilla/5.0", WGET_HTTP_HEADER_ADD, "Accept-Encoding", "gzip, br", WGET_HTTP_HEADER_ADD, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", /* some sites need this */ WGET_HTTP_HEADER_ADD, "Accept-Encoding", "gzip, br", // WGET_HTTP_HEADER_ADD, "Upgrade-Insecure-Requests", "1", WGET_HTTP_MAX_REDIRECTIONS, 0, // WGET_HTTP_CONNECTION_PTR, &conn, 0); if (!resp) { wget_info_printf(" No connection / response\n"); break; } snprintf(stats.content_type, sizeof(stats.content_type), "%s", resp->content_type); stats.status = resp->code; if (resp->code != 200) { if (resp->location) { stats.redirs++; wget_info_printf(" Response code %hd, %s\n", resp->code, resp->location); char *newurl = _normalize_location(url, resp->location); if (!newurl) { wget_info_printf(" Failed to normalize '%s', '%s'\n", url, resp->location); break; } wget_xfree(url); url = newurl; if (wget_strncasecmp(url, "https://", 8)) stats.redir_insecure++; continue; } wget_info_printf(" Response code %hd\n", resp->code); break; } if (wget_strncasecmp(url, "https://", 8)) break; // no need to parse, we landed on HTTP stats.landed_on_https = 1; break; } // free the response wget_http_free_response(&resp); // close connection if still open // wget_http_close(&conn); write_stats(&stats); } wget_xfree(url); return NULL; }