diff --git a/examples/Makefile.am b/examples/Makefile.am
index 64de6819..7c8d6bf6 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,4 +1,5 @@
 noinst_PROGRAMS = \
+	batch_loader\
 	getstream\
 	http_get2\
 	http_multi_get\
diff --git a/examples/batch_loader.c b/examples/batch_loader.c
new file mode 100644
index 00000000..6e020af6
--- /dev/null
+++ b/examples/batch_loader.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright(c) 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of libwget.
+ *
+ * Libwget is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Libwget is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libwget. If not, see <https://www.gnu.org/licenses/>.
+ *
+ *
+ * Read URLs from stdin, follow redirects and append per-host stats to out.csv.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#ifndef _WIN32
+# include <signal.h>
+#endif
+#include <wget.h>
+
+typedef struct {
+	int
+		http_links, https_links,
+		status,
+		redirs,
+		redir_insecure,
+		landed_on_https;
+	char
+		host[256],
+		content_type[128];
+} stats_t;
+
+#define MAXTHREADS 500
+
+static void *downloader_thread(void *p);
+
+static void write_stats(const stats_t *stats)
+{
+	FILE *fp;
+
+	if ((fp = fopen("out.csv", "a"))) {
+		fprintf(fp, "%s,%d,%d,%d,%d,%s\n",
+			stats->host, stats->status, stats->redir_insecure, stats->redirs, stats->landed_on_https,
+			stats->content_type);
+		fclose(fp);
+	}
+}
+
+/*
+ * helper function: percent-unescape, convert to utf-8, create URL string using base
+ */
+static int _normalize_uri(wget_iri_t *base, wget_string_t *url, const char *encoding, wget_buffer_t *buf)
+{
+	char *urlpart_encoded;
+	size_t urlpart_encoded_length;
+	int rc;
+
+	if (url->len == 0 || (url->len >= 1 && *url->p == '#')) // ignore e.g. href='#'
+		return -1;
+
+	char *urlpart = wget_strmemdup(url->p, url->len);
+
+	wget_iri_unescape_url_inline(urlpart);
+	rc = wget_memiconv(encoding, urlpart, strlen(urlpart), "utf-8", &urlpart_encoded, &urlpart_encoded_length);
+	wget_xfree(urlpart);
+
+	if (rc)
+		return -2;
+
+	rc = !wget_iri_relative_to_abs(base, urlpart_encoded, urlpart_encoded_length, buf);
+	wget_xfree(urlpart_encoded);
+
+	if (rc)
+		return -3;
+
+	return 0;
+}
+
+static char *_normalize_location(const char *base, const char *url)
+{
+	wget_buffer_t buf;
+	wget_string_t url_s = { .p = url, .len = strlen(url) };
+	wget_iri_t *base_iri = wget_iri_parse(base, "utf-8");
+	char sbuf[1024], *norm_url = NULL;
+	int rc;
+
+	if (!base_iri)
+		return NULL;
+
+	wget_buffer_init(&buf, sbuf, sizeof(sbuf));
+	if ((rc = _normalize_uri(base_iri, &url_s, "utf-8", &buf)) == 0) {
+		norm_url = wget_strmemdup(buf.data, buf.length);
+	}
+	wget_buffer_deinit(&buf);
+
+	wget_iri_free(&base_iri);
+
+	return norm_url;
+}
+
+int main(int argc G_GNUC_WGET_UNUSED, const char *const *argv G_GNUC_WGET_UNUSED)
+{
+	static wget_thread_t downloaders[MAXTHREADS];
+
+	// set up libwget global configuration
+	wget_global_init(
+//		WGET_DEBUG_STREAM, stderr,
+		WGET_ERROR_STREAM, stdout,
+		WGET_INFO_STREAM, stdout,
+		WGET_DNS_CACHING, 0,
+		0);
+
+#ifndef _WIN32
+	struct sigaction sig_action;
+	memset(&sig_action, 0, sizeof(sig_action));
+	sig_action.sa_sigaction = (void (*)(int, siginfo_t *, void *))SIG_IGN;
+	sigaction(SIGPIPE, &sig_action, NULL); // this forces socket error return
+#endif
+
+	// set global timeouts to 3s
+	wget_tcp_set_timeout(NULL, 3000);
+	wget_tcp_set_connect_timeout(NULL, 3000);
+	wget_tcp_set_dns_timeout(NULL, 3000);
+
+	// OCSP off
+	wget_ssl_set_config_int(WGET_SSL_OCSP, 0);
+	wget_ssl_set_config_int(WGET_SSL_OCSP_STAPLING, 0);
+
+	// don't check cert and SNI
+	wget_ssl_set_config_int(WGET_SSL_CHECK_CERTIFICATE, 0);
+	wget_ssl_set_config_int(WGET_SSL_CHECK_HOSTNAME, 0);
+
+	// start threads
+	for (int rc, it = 0; it < MAXTHREADS; it++) {
+		if ((rc = wget_thread_start(&downloaders[it], downloader_thread, NULL, 0)) != 0) {
+			wget_error_printf("Failed to start thread, error %d\n", rc);
+		}
+	}
+
+	// wait until threads are done
+	for (int rc, it = 0; it < MAXTHREADS; it++) {
+		if ((rc = wget_thread_join(&downloaders[it])) != 0)
+			wget_error_printf("Failed to wait for downloader #%d (%d %d)\n", it, rc, errno);
+	}
+
+	// free resources - needed for valgrind testing
+	wget_global_deinit();
+
+	return 0;
+}
+
+static void *downloader_thread(G_GNUC_WGET_UNUSED void *p)
+{
+	stats_t stats;
+	wget_http_response_t *resp = NULL;
+	char *url = NULL;
+
+	while (fscanf(stdin, "%255s", stats.host) == 1) {
+		wget_xfree(url);
+
+		if (!wget_strncasecmp_ascii(stats.host, "http://", 7))
+			url = wget_strdup(stats.host);
+		else if (!wget_strncasecmp_ascii(stats.host, "https://", 8))
+			url = wget_strdup(stats.host);
+		else
+			url = wget_aprintf("http://%s", stats.host);
+
+		stats.http_links = stats.https_links = 0;
+		stats.status = -1;
+		stats.redirs = stats.redir_insecure = stats.landed_on_https = 0;
+		*stats.content_type = 0;
+
+		// follow up to max 5 redirections, stop if we land on plain HTTP
+		for (int redirs = 0, max = 5; redirs < max; redirs++) {
+
+			wget_http_free_response(&resp);
+//			wget_http_close(&conn);
+
+			wget_info_printf("%s%s\n", redirs ? " -> " : "", url);
" -> " : "", url); + + // execute an HTTP GET request and return the response + resp = wget_http_get( + WGET_HTTP_URL, url, + WGET_HTTP_HEADER_ADD, "User-Agent", "Mozilla/5.0", + WGET_HTTP_HEADER_ADD, "Accept-Encoding", "gzip, br", + WGET_HTTP_HEADER_ADD, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", /* some sites need this */ + WGET_HTTP_HEADER_ADD, "Accept-Encoding", "gzip, br", +// WGET_HTTP_HEADER_ADD, "Upgrade-Insecure-Requests", "1", + WGET_HTTP_MAX_REDIRECTIONS, 0, +// WGET_HTTP_CONNECTION_PTR, &conn, + 0); + + if (!resp) { + wget_info_printf(" No connection / response\n"); + break; + } + + snprintf(stats.content_type, sizeof(stats.content_type), "%s", resp->content_type); + + stats.status = resp->code; + if (resp->code != 200) { + if (resp->location) { + stats.redirs++; + + wget_info_printf(" Response code %hd, %s\n", resp->code, resp->location); + + char *newurl = _normalize_location(url, resp->location); + if (!newurl) { + wget_info_printf(" Failed to normalize '%s', '%s'\n", url, resp->location); + break; + } + wget_xfree(url); + url = newurl; + + if (wget_strncasecmp(url, "https://", 8)) + stats.redir_insecure++; + + continue; + } + + wget_info_printf(" Response code %hd\n", resp->code); + break; + } + + if (wget_strncasecmp(url, "https://", 8)) + break; // no need to parse, we landed on HTTP + + stats.landed_on_https = 1; + + break; + } + + // free the response + wget_http_free_response(&resp); + + // close connection if still open +// wget_http_close(&conn); + + write_stats(&stats); + } + + wget_xfree(url); + + return NULL; +}