From a85b163ee9f45df275de3057f3a84b428344daf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Tue, 9 Sep 2014 13:18:32 +0200 Subject: [PATCH] added --follow-tags and --ignore-tags --- examples/print_html_urls.c | 2 +- include/libmget.h | 9 +- libmget/html_url.c | 52 ++++++--- src/mget.c | 4 +- src/options.c | 79 +++++++++++++- src/options.h | 4 +- tests/Makefile.am | 2 +- tests/test--accept.c | 4 +- tests/test--follow-tags.c | 215 +++++++++++++++++++++++++++++++++++++ 9 files changed, 348 insertions(+), 23 deletions(-) create mode 100644 tests/test--follow-tags.c diff --git a/examples/print_html_urls.c b/examples/print_html_urls.c index 14e22f61..09ab9cd7 100644 --- a/examples/print_html_urls.c +++ b/examples/print_html_urls.c @@ -39,7 +39,7 @@ static void html_parse_localfile(const char *fname) char *data; if ((data = mget_read_file(fname, NULL))) { - MGET_HTML_PARSED_RESULT *res = mget_html_get_urls_inline(data); + MGET_HTML_PARSED_RESULT *res = mget_html_get_urls_inline(data, NULL, NULL); if (res->encoding) printf("URI encoding '%s'\n", res->encoding); diff --git a/include/libmget.h b/include/libmget.h index 53f0be16..bf94c14a 100644 --- a/include/libmget.h +++ b/include/libmget.h @@ -986,8 +986,15 @@ typedef struct { follow; } MGET_HTML_PARSED_RESULT; +typedef struct { + const char * + name; + const char * + attribute; +} mget_html_tag_t; + MGET_HTML_PARSED_RESULT * - mget_html_get_urls_inline(const char *html); + mget_html_get_urls_inline(const char *html, mget_vector_t *additional_tags, mget_vector_t *ignore_tags); void mget_html_free_urls_inline(MGET_HTML_PARSED_RESULT **res); diff --git a/libmget/html_url.c b/libmget/html_url.c index 2f5d4dab..a0a78667 100644 --- a/libmget/html_url.c +++ b/libmget/html_url.c @@ -39,10 +39,14 @@ typedef struct { MGET_HTML_PARSED_RESULT result; + mget_vector_t * + additional_tags; + mget_vector_t * + ignore_tags; char found_robots, found_content_type; -} _HTML_CONTEXT; +} _html_context_t; // see http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value static const char maybe[256] = { @@ -75,9 +79,9 @@ static const char attrs[][12] = { }; // Callback function, called from HTML parser for each URI found. -static void _html_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_MGET_UNUSED) +static void _html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos G_GNUC_MGET_UNUSED) { - _HTML_CONTEXT *ctx = context; + _html_context_t *ctx = context; // Read the encoding from META tag, e.g. from // . @@ -85,16 +89,16 @@ static void _html_get_url(void *context, int flags, const char *dir, const char // // Also ,we are interested in ROBOTS e.g. // - if ((flags & XML_FLG_BEGIN) && (*dir|0x20) == 'm' && !strcasecmp(dir, "meta")) { + if ((flags & XML_FLG_BEGIN) && (*tag|0x20) == 'm' && !strcasecmp(tag, "meta")) { ctx->found_robots = ctx->found_content_type = 0; } if ((flags & XML_FLG_ATTRIBUTE) && val) { - MGET_HTML_PARSED_RESULT *res = &ctx->result; + MGET_HTML_PARSED_RESULT *res = &ctx->result; // info_printf("%02X %s %s '%.*s' %zd %zd\n", flags, dir, attr, (int) len, val, len, pos); - if ((*dir|0x20) == 'm' && !strcasecmp(dir, "meta")) { + if ((*tag|0x20) == 'm' && !strcasecmp(tag, "meta")) { if (!ctx->found_robots) { if (!strcasecmp(attr, "name") && !strncasecmp(val, "robots", len)) { ctx->found_robots = 1; @@ -145,15 +149,31 @@ static void _html_get_url(void *context, int flags, const char *dir, const char return; } - // shortcut to avoid unneeded calls to bsearch() - if (!maybe[(unsigned char)*attr|0x20] || !attr[1] || !attr[2]) - return; + if (ctx->ignore_tags) { + if (mget_vector_find(ctx->ignore_tags, &(mget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1 + || mget_vector_find(ctx->ignore_tags, &(mget_html_tag_t){ .name = tag, .attribute = attr } ) != -1) + return; + } - if (bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))strcasecmp)) { + // shortcut to avoid unneeded calls to bsearch() + int found = 0; + + // search the static list for a tag/attr match + if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2]) + found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))strcasecmp) != NULL; + + // search the dynamic list for a tag/attr match + if (!found && ctx->additional_tags) { + if (mget_vector_find(ctx->additional_tags, &(mget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1 + || mget_vector_find(ctx->additional_tags, &(mget_html_tag_t){ .name = tag, .attribute = attr } ) != -1) + found = 1; + } + + if (found) { for (;len && isspace(*val); val++, len--); // skip leading spaces for (;len && isspace(val[len - 1]); len--); // skip trailing spaces - if ((*dir|0x20) == 'd' && !strcasecmp(dir,"base")) { + if ((*tag|0x20) == 'b' && !strcasecmp(tag,"base")) { // found a res->base.p = val; res->base.len = len; @@ -165,7 +185,7 @@ static void _html_get_url(void *context, int flags, const char *dir, const char MGET_HTML_PARSED_URL url; strlcpy(url.attr, attr, sizeof(url.attr)); - strlcpy(url.dir, dir, sizeof(url.dir)); + strlcpy(url.dir, tag, sizeof(url.dir)); url.url.p = val; url.url.len = len; mget_vector_add(res->uris, &url, sizeof(url)); @@ -203,9 +223,13 @@ void mget_html_free_urls_inline (MGET_HTML_PARSED_RESULT **res) } } -MGET_HTML_PARSED_RESULT *mget_html_get_urls_inline(const char *html) +MGET_HTML_PARSED_RESULT *mget_html_get_urls_inline(const char *html, mget_vector_t *additional_tags, mget_vector_t *ignore_tags) { - _HTML_CONTEXT context = { .result.follow = 1 }; + _html_context_t context = { + .result.follow = 1, + .additional_tags = additional_tags, + .ignore_tags = ignore_tags + }; // context.result.uris = mget_vector_create(32, -2, NULL); mget_html_parse_buffer(html, _html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT); diff --git a/src/mget.c b/src/mget.c index 72f17f53..e01911ed 100644 --- a/src/mget.c +++ b/src/mget.c @@ -1007,7 +1007,7 @@ void *downloader_thread(void *p) job->head_first = 1; // enable mime-type check to assure e.g. text/html to be downloaded and parsed } - info_printf("head_first=%d deferred=%d iri=%s\n", job->head_first, !!job->deferred, job->iri->uri); +// info_printf("head_first=%d deferred=%d iri=%s\n", job->head_first, !!job->deferred, job->iri->uri); if ((config.spider || config.chunk_size || job->head_first) && !job->deferred) { // In spider mode, we first make a HEAD request. // If the Content-Type header gives us not a parsable type, we are done. @@ -1345,7 +1345,7 @@ static unsigned int G_GNUC_MGET_PURE hash_url(const char *url) void html_parse(JOB *job, int level, const char *html, const char *encoding, mget_iri_t *base) { - MGET_HTML_PARSED_RESULT *parsed = mget_html_get_urls_inline(html); + MGET_HTML_PARSED_RESULT *parsed = mget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags); mget_iri_t *allocated_base = NULL; const char *reason; mget_buffer_t buf; diff --git a/src/options.c b/src/options.c index cd7d5e37..ae64594c 100644 --- a/src/options.c +++ b/src/options.c @@ -160,6 +160,8 @@ static int G_GNUC_MGET_NORETURN print_help(G_GNUC_MGET_UNUSED option_t opt, G_GN " --robots Respect robots.txt standard for recursive downloads. (default: on)\n" " --restrict-file-names unix, windows, nocontrol, ascii, lowercase, uppercase, none\n" " -m --mirror Turn on mirroring options -r -N -l inf\n" + " --follow-tags Scan additional tag/attributes for URLs, e.g. --follow-tags=\"img/data-500px,img/data-hires\n" + " --ignore-tags Ignore tag/attributes for URL scanning, e.g. --ignore-tags=\"img,a/href\n" "\n"); puts( "HTTP related options:\n" @@ -300,7 +302,7 @@ static int parse_stringlist(option_t opt, G_GNUC_MGET_UNUSED const char *const * { mget_vector_t *v = *((mget_vector_t **)opt->var); - if (val) { + if (val && *val) { const char *s, *p; if (!v) @@ -319,6 +321,77 @@ static int parse_stringlist(option_t opt, G_GNUC_MGET_UNUSED const char *const * return 0; } +static void _free_tag(mget_html_tag_t *tag) +{ + if (tag) { + xfree(tag->attribute); + xfree(tag->name); + } +} + +static void G_GNUC_MGET_NONNULL_ALL _add_tag(mget_vector_t *v, const char *begin, const char *end) +{ + mget_html_tag_t tag; + const char *attribute; + + if ((attribute = memchr(begin, '/', end - begin))) { + tag.name = strndup(begin, attribute - begin); + tag.attribute = strndup(attribute + 1, (end - begin) - (attribute - begin) - 1); + } else { + tag.name = strndup(begin, end - begin); + tag.attribute = NULL; + } + + if (mget_vector_find(v, &tag) == -1) + mget_vector_insert_sorted(v, &tag, sizeof(tag)); + else + _free_tag(&tag); // avoid double entries +} + +static int G_GNUC_MGET_NONNULL_ALL _compare_tag(const mget_html_tag_t *t1, const mget_html_tag_t *t2) +{ + int n; + + if (!(n = strcasecmp(t1->name, t2->name))) { + if (!t1->attribute) { + if (!t2->attribute) + n = 0; + else + n = -1; + } else if (!t2->attribute) { + n = 1; + } else + n = strcasecmp(t1->attribute, t2->attribute); + } + + return n; +} + +static int parse_taglist(option_t opt, G_GNUC_MGET_UNUSED const char *const *argv, const char *val) +{ + mget_vector_t *v = *((mget_vector_t **)opt->var); + + if (val && *val) { + const char *s, *p; + + if (!v) { + v = *((mget_vector_t **)opt->var) = mget_vector_create(8, -2, (int(*)(const void *, const void *))_compare_tag); + mget_vector_set_destructor(v, (void(*)(void *))_free_tag); + } + + for (s = val; (p = strchr(s, ',')); s = p + 1) { + if (p != s) + _add_tag(v, s, p); + } + if (*s) + _add_tag(v, s, s + strlen(s)); + } else { + mget_vector_free(&v); + } + + return 0; +} + static int parse_bool(option_t opt, G_GNUC_MGET_UNUSED const char *const *argv, const char *val) { if (opt->var) { @@ -548,6 +621,7 @@ static const struct option options[] = { { "egd-file", &config.egd_file, parse_string, 1, 0 }, { "exclude-domains", &config.exclude_domains, parse_stringset, 1, 0 }, { "execute", NULL, parse_execute, 1, 'e' }, + { "follow-tags", &config.follow_tags, parse_taglist, 1, 0 }, { "force-atom", &config.force_atom, parse_bool, 0, 0 }, { "force-css", &config.force_css, parse_bool, 0, 0 }, { "force-directories", &config.force_directories, parse_bool, 0, 'x' }, @@ -567,6 +641,7 @@ static const struct option options[] = { { "https-only", &config.https_only, parse_bool, 0, 0 }, { "https-proxy", &config.https_proxy, parse_string, 1, 0 }, { "ignore-case", &config.ignore_case, parse_bool, 0, 0 }, + { "ignore-tags", &config.ignore_tags, parse_taglist, 1, 0 }, { "inet4-only", &config.inet4_only, parse_bool, 0, '4' }, { "inet6-only", &config.inet6_only, parse_bool, 0, '6' }, { "input-encoding", &config.input_encoding, parse_string, 1, 0 }, @@ -1278,6 +1353,8 @@ void deinit(void) mget_stringmap_free(&config.domains); mget_stringmap_free(&config.exclude_domains); + mget_vector_free(&config.follow_tags); + mget_vector_free(&config.ignore_tags); mget_http_set_http_proxy(NULL, NULL); mget_http_set_https_proxy(NULL, NULL); diff --git a/src/options.h b/src/options.h index 44b393c6..53966363 100644 --- a/src/options.h +++ b/src/options.h @@ -80,7 +80,9 @@ struct config { *exclude_domains; mget_vector_t *accept_patterns, - *reject_patterns; + *reject_patterns, + *follow_tags, + *ignore_tags; mget_hsts_db_t *hsts_db; // in-memory HSTS database size_t diff --git a/tests/Makefile.am b/tests/Makefile.am index e84fb72e..ce83eb20 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -9,7 +9,7 @@ MGET_TESTS = test test-mget-1 test-restrict-ascii test-i-http test-i-https test- test-meta-robots test-idn-robots test-idn-meta test-idn-cmd \ test-iri test-iri-percent test-iri-list test-iri-forced-remote \ test-auth-basic test-parse-html test-parse-rss test--page-requisites test--accept \ - test-k + test-k test--follow-tags #test--post-file test-E-k diff --git a/tests/test--accept.c b/tests/test--accept.c index 7cfde01a..d659ed1a 100644 --- a/tests/test--accept.c +++ b/tests/test--accept.c @@ -93,7 +93,7 @@ int main(void) // --accept using just suffixes mget_test( - MGET_TEST_OPTIONS, "--num-threads=1 -r -nH --accept '.jpeg'", + MGET_TEST_OPTIONS, "-r -nH --accept '.jpeg'", MGET_TEST_REQUEST_URL, "index.html", MGET_TEST_EXPECTED_ERROR_CODE, 0, MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { @@ -105,7 +105,7 @@ int main(void) // --reject using just suffixes mget_test( - MGET_TEST_OPTIONS, "--num-threads=1 -r -nH --reject '.jpeg'", + MGET_TEST_OPTIONS, "-r -nH --reject '.jpeg'", MGET_TEST_REQUEST_URL, "index.html", MGET_TEST_EXPECTED_ERROR_CODE, 0, MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { diff --git a/tests/test--follow-tags.c b/tests/test--follow-tags.c new file mode 100644 index 00000000..22e54441 --- /dev/null +++ b/tests/test--follow-tags.c @@ -0,0 +1,215 @@ +/* + * Copyright(c) 2013 Tim Ruehsen + * + * This file is part of libmget. + * + * Libmget is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Libmget is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libmget. If not, see . + * + * + * Testing Mget + * + * Changelog + * 09.09.2014 Tim Ruehsen created + * + */ + +#if HAVE_CONFIG_H +# include +#endif + +#include // exit() +#include "libtest.h" + +int main(void) +{ + mget_test_url_t urls[]={ + { .name = "/index.html", + .code = "200 Dontcare", + .body = + "Main Page

A link to a" \ + " second page." \ + "

", + .headers = { + "Content-Type: text/html", + } + }, + { .name = "/secondpage.html", + .code = "200 Dontcare", + .body = + "Second Page

A link to a" \ + " Picture 2a." \ + " " \ + "

", + .headers = { + "Content-Type: text/html", + } + }, + { .name = "/2a.jpeg", + .code = "200 Dontcare", + .body = "pic 2a", + .headers = { "Content-Type: image/jpeg" } + }, + { .name = "/2b.jpeg", + .code = "200 Dontcare", + .body = "pic 2b", + .headers = { "Content-Type: image/jpeg" } + }, + { .name = "/2c.jpeg", + .code = "200 Dontcare", + .body = "pic 2c", + .headers = { "Content-Type: image/jpeg" } + }, + { .name = "/2d.jpeg", + .code = "200 Dontcare", + .body = "pic 2c", + .headers = { "Content-Type: image/jpeg" } + }, + }; + + // functions won't come back if an error occurs + mget_test_start_http_server( + MGET_TEST_RESPONSE_URLS, &urls, countof(urls), + 0); + + // without additional tags + mget_test( + MGET_TEST_OPTIONS, "-r -nH ", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[3].name + 1, urls[3].body }, + { NULL } }, + 0); + + // --follow-tags single entry + mget_test( + MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-500px'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[3].name + 1, urls[3].body }, + { urls[4].name + 1, urls[4].body }, + { NULL } }, + 0); + + // --follow-tags single entry without attribute + mget_test( + MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[3].name + 1, urls[3].body }, + { urls[4].name + 1, urls[4].body }, + { urls[5].name + 1, urls[5].body }, + { NULL } }, + 0); + + // --follow-tags two entries + mget_test( + MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-500px,img/data-highres'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[3].name + 1, urls[3].body }, + { urls[4].name + 1, urls[4].body }, + { urls[5].name + 1, urls[5].body }, + { NULL } }, + 0); + + // --follow-tags two entries + mget_test( + MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-highres,img/data-500px'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[3].name + 1, urls[3].body }, + { urls[4].name + 1, urls[4].body }, + { urls[5].name + 1, urls[5].body }, + { NULL } }, + 0); + + // --ignore-tags single entry + mget_test( + MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { NULL } }, + 0); + + // --ignore-tags single entry without attribute + mget_test( + MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { NULL } }, + 0); + + // --ignore-tags two entries + mget_test( + MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src,a/href'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { NULL } }, + 0); + + // --ignore-tags two entries + mget_test( + MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'a/href,img/src'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { NULL } }, + 0); + + // --ignore-tags and --follow-tags combined + mget_test( + MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src' --follow-tags='img/data-500px'", + MGET_TEST_REQUEST_URL, "index.html", + MGET_TEST_EXPECTED_ERROR_CODE, 0, + MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) { + { urls[0].name + 1, urls[0].body }, + { urls[1].name + 1, urls[1].body }, + { urls[2].name + 1, urls[2].body }, + { urls[4].name + 1, urls[4].body }, + { NULL } }, + 0); + + exit(0); +}