From a85b163ee9f45df275de3057f3a84b428344daf1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Tue, 9 Sep 2014 13:18:32 +0200
Subject: [PATCH] added --follow-tags and --ignore-tags

---
 examples/print_html_urls.c |   2 +-
 include/libmget.h          |   9 +-
 libmget/html_url.c         |  52 ++++++---
 src/mget.c                 |   4 +-
 src/options.c              |  79 +++++++++++++-
 src/options.h              |   4 +-
 tests/Makefile.am          |   2 +-
 tests/test--accept.c       |   4 +-
 tests/test--follow-tags.c  | 215 +++++++++++++++++++++++++++++++++++++
 9 files changed, 348 insertions(+), 23 deletions(-)
 create mode 100644 tests/test--follow-tags.c
diff --git a/examples/print_html_urls.c b/examples/print_html_urls.c
index 14e22f61..09ab9cd7 100644
--- a/examples/print_html_urls.c
+++ b/examples/print_html_urls.c
@@ -39,7 +39,7 @@ static void html_parse_localfile(const char *fname)
 	char *data;
 
 	if ((data = mget_read_file(fname, NULL))) {
-		MGET_HTML_PARSED_RESULT *res  = mget_html_get_urls_inline(data);
+		MGET_HTML_PARSED_RESULT *res  = mget_html_get_urls_inline(data, NULL, NULL);
 
 		if (res->encoding)
 			printf("URI encoding '%s'\n", res->encoding);
diff --git a/include/libmget.h b/include/libmget.h
index 53f0be16..bf94c14a 100644
--- a/include/libmget.h
+++ b/include/libmget.h
@@ -986,8 +986,15 @@ typedef struct {
 		follow;
 } MGET_HTML_PARSED_RESULT;
 
+typedef struct { // selects a tag, or one attribute of a tag, for URL scanning
+	const char *
+		name; // tag name, e.g. "img"
+	const char *
+		attribute; // attribute name, e.g. "src"; NULL when only the tag was given
+} mget_html_tag_t;
+
 MGET_HTML_PARSED_RESULT *
-	mget_html_get_urls_inline(const char *html);
+	mget_html_get_urls_inline(const char *html, mget_vector_t *additional_tags, mget_vector_t *ignore_tags);
 void
 	mget_html_free_urls_inline(MGET_HTML_PARSED_RESULT **res);
 
diff --git a/libmget/html_url.c b/libmget/html_url.c
index 2f5d4dab..a0a78667 100644
--- a/libmget/html_url.c
+++ b/libmget/html_url.c
@@ -39,10 +39,14 @@
 typedef struct {
 	MGET_HTML_PARSED_RESULT
 		result;
+	mget_vector_t *
+		additional_tags;
+	mget_vector_t *
+		ignore_tags;
 	char
 		found_robots,
 		found_content_type;
-} _HTML_CONTEXT;
+} _html_context_t;
 
 // see http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
 static const char maybe[256] = {
@@ -75,9 +79,9 @@ static const char attrs[][12] = {
 };
 
 // Callback function, called from HTML parser for each URI found.
-static void _html_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_MGET_UNUSED)
+static void _html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos G_GNUC_MGET_UNUSED)
 {
-	_HTML_CONTEXT *ctx = context;
+	_html_context_t *ctx = context;
 
 	// Read the encoding from META tag, e.g. from
 	//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">.
@@ -85,16 +89,16 @@ static void _html_get_url(void *context, int flags, const char *dir, const char
 	//
 	// Also ,we are interested in ROBOTS e.g.
 	//   <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
-	if ((flags & XML_FLG_BEGIN) && (*dir|0x20) == 'm' && !strcasecmp(dir, "meta")) {
+	if ((flags & XML_FLG_BEGIN) && (*tag|0x20) == 'm' && !strcasecmp(tag, "meta")) {
 		ctx->found_robots = ctx->found_content_type = 0;
 	}
 
 	if ((flags & XML_FLG_ATTRIBUTE) && val) {
-	MGET_HTML_PARSED_RESULT *res = &ctx->result;
+		MGET_HTML_PARSED_RESULT *res = &ctx->result;
 
 //		info_printf("%02X %s %s '%.*s' %zd %zd\n", flags, dir, attr, (int) len, val, len, pos);
 
-		if ((*dir|0x20) == 'm' && !strcasecmp(dir, "meta")) {
+		if ((*tag|0x20) == 'm' && !strcasecmp(tag, "meta")) {
 			if (!ctx->found_robots) {
 				if (!strcasecmp(attr, "name") && !strncasecmp(val, "robots", len)) {
 					ctx->found_robots = 1;
@@ -145,15 +149,31 @@ static void _html_get_url(void *context, int flags, const char *dir, const char
 			return;
 		}
 
-		// shortcut to avoid unneeded calls to bsearch()
-		if (!maybe[(unsigned char)*attr|0x20] || !attr[1] || !attr[2])
-			return;
+		if (ctx->ignore_tags) {
+			if (mget_vector_find(ctx->ignore_tags, &(mget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1
+				|| mget_vector_find(ctx->ignore_tags, &(mget_html_tag_t){ .name = tag, .attribute = attr } ) != -1)
+				return;
+		}
 
-		if (bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))strcasecmp)) {
+		// shortcut to avoid unneeded calls to bsearch()
+		int found = 0;
+
+		// search the static list for a tag/attr match
+		if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2])
+			found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))strcasecmp) != NULL;
+
+		// search the dynamic list for a tag/attr match
+		if (!found && ctx->additional_tags) {
+			if (mget_vector_find(ctx->additional_tags, &(mget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1
+				|| mget_vector_find(ctx->additional_tags, &(mget_html_tag_t){ .name = tag, .attribute = attr } ) != -1)
+				found = 1;
+		}
+
+		if (found) {
 			for (;len && isspace(*val); val++, len--); // skip leading spaces
 			for (;len && isspace(val[len - 1]); len--);  // skip trailing spaces
 
-			if ((*dir|0x20) == 'd' && !strcasecmp(dir,"base")) {
+			if ((*tag|0x20) == 'b' && !strcasecmp(tag,"base")) {
 				// found a <BASE href="...">
 				res->base.p = val;
 				res->base.len = len;
@@ -165,7 +185,7 @@ static void _html_get_url(void *context, int flags, const char *dir, const char
 
 			MGET_HTML_PARSED_URL url;
 			strlcpy(url.attr, attr, sizeof(url.attr));
-			strlcpy(url.dir, dir, sizeof(url.dir));
+			strlcpy(url.dir, tag, sizeof(url.dir));
 			url.url.p = val;
 			url.url.len = len;
 			mget_vector_add(res->uris, &url, sizeof(url));
@@ -203,9 +223,13 @@ void mget_html_free_urls_inline (MGET_HTML_PARSED_RESULT **res)
 	}
 }
 
-MGET_HTML_PARSED_RESULT *mget_html_get_urls_inline(const char *html)
+MGET_HTML_PARSED_RESULT *mget_html_get_urls_inline(const char *html, mget_vector_t *additional_tags, mget_vector_t *ignore_tags)
 {
-	_HTML_CONTEXT context = { .result.follow = 1 };
+	_html_context_t context = {
+		.result.follow = 1,
+		.additional_tags = additional_tags,
+		.ignore_tags = ignore_tags
+	};
 
 //	context.result.uris = mget_vector_create(32, -2, NULL);
 	mget_html_parse_buffer(html, _html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);
diff --git a/src/mget.c b/src/mget.c
index 72f17f53..e01911ed 100644
--- a/src/mget.c
+++ b/src/mget.c
@@ -1007,7 +1007,7 @@ void *downloader_thread(void *p)
 				job->head_first = 1; // enable mime-type check to assure e.g. text/html to be downloaded and parsed
 		}
 
-		info_printf("head_first=%d deferred=%d iri=%s\n", job->head_first, !!job->deferred, job->iri->uri);
+//		info_printf("head_first=%d deferred=%d iri=%s\n", job->head_first, !!job->deferred, job->iri->uri);
 		if ((config.spider || config.chunk_size || job->head_first) && !job->deferred) {
 			// In spider mode, we first make a HEAD request.
 			// If the Content-Type header gives us not a parsable type, we are done.
@@ -1345,7 +1345,7 @@ static unsigned int G_GNUC_MGET_PURE hash_url(const char *url)
 
 void html_parse(JOB *job, int level, const char *html, const char *encoding, mget_iri_t *base)
 {
-	MGET_HTML_PARSED_RESULT *parsed  = mget_html_get_urls_inline(html);
+	MGET_HTML_PARSED_RESULT *parsed  = mget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
 	mget_iri_t *allocated_base = NULL;
 	const char *reason;
 	mget_buffer_t buf;
diff --git a/src/options.c b/src/options.c
index cd7d5e37..ae64594c 100644
--- a/src/options.c
+++ b/src/options.c
@@ -160,6 +160,8 @@ static int G_GNUC_MGET_NORETURN print_help(G_GNUC_MGET_UNUSED option_t opt, G_GN
 		"       --robots           Respect robots.txt standard for recursive downloads. (default: on)\n"
 		"       --restrict-file-names  unix, windows, nocontrol, ascii, lowercase, uppercase, none\n"
 		"  -m   --mirror           Turn on mirroring options -r -N -l inf\n"
+		"       --follow-tags      Scan additional tag/attributes for URLs, e.g. --follow-tags=\"img/data-500px,img/data-hires\n"
+		"       --ignore-tags      Ignore tag/attributes for URL scanning, e.g. --ignore-tags=\"img,a/href\n"
 		"\n");
 	puts(
 		"HTTP related options:\n"
@@ -300,7 +302,7 @@ static int parse_stringlist(option_t opt, G_GNUC_MGET_UNUSED const char *const *
 {
 	mget_vector_t *v = *((mget_vector_t **)opt->var);
 
-	if (val) {
+	if (val && *val) {
 		const char *s, *p;
 
 		if (!v)
@@ -319,6 +321,77 @@ static int parse_stringlist(option_t opt, G_GNUC_MGET_UNUSED const char *const *
 	return 0;
 }
 
+static void _free_tag(mget_html_tag_t *tag)
+{
+	if (!tag) // nothing to release
+		return;
+	xfree(tag->name); // name and attribute are the strndup() copies made by _add_tag()
+	xfree(tag->attribute); // may be NULL for a tag-only entry
+}
+
+static void G_GNUC_MGET_NONNULL_ALL _add_tag(mget_vector_t *v, const char *begin, const char *end)
+{
+	const char *slash = memchr(begin, '/', end - begin); // first '/' splits "name/attribute"
+	mget_html_tag_t entry;
+
+	if (!slash) {
+		entry.name = strndup(begin, end - begin); // bare "name": matches the tag as a whole
+		entry.attribute = NULL;
+	} else {
+		entry.name = strndup(begin, slash - begin);
+		entry.attribute = strndup(slash + 1, end - slash - 1);
+	}
+
+	if (mget_vector_find(v, &entry) != -1)
+		_free_tag(&entry); // already listed, drop the duplicate
+	else
+		mget_vector_insert_sorted(v, &entry, sizeof(entry));
+}
+
+static int G_GNUC_MGET_NONNULL_ALL _compare_tag(const mget_html_tag_t *t1, const mget_html_tag_t *t2)
+{
+	int diff = strcasecmp(t1->name, t2->name);
+
+	// different tag names decide the order on their own
+	if (diff)
+		return diff;
+
+	// same name: an entry without attribute sorts before any entry with one
+	if (!t1->attribute)
+		return t2->attribute ? -1 : 0;
+
+	if (!t2->attribute)
+		return 1;
+
+	// both carry an attribute, compare those case-insensitively
+	return strcasecmp(t1->attribute, t2->attribute);
+}
+
+static int parse_taglist(option_t opt, G_GNUC_MGET_UNUSED const char *const *argv, const char *val)
+{
+	mget_vector_t **list = (mget_vector_t **)opt->var; // the config slot this option fills
+	// val is a comma separated list of "tag" or "tag/attribute" items to merge into *list; always returns 0
+	if (val && *val) {
+		const char *s, *p;
+
+		if (!*list) { // first use: create the list, ordered by _compare_tag, entries released by _free_tag
+			*list = mget_vector_create(8, -2, (int(*)(const void *, const void *))_compare_tag);
+			mget_vector_set_destructor(*list, (void(*)(void *))_free_tag);
+		}
+
+		for (s = val; (p = strchr(s, ',')); s = p + 1) {
+			if (p != s) // skip empty items such as the middle one in "a,,b"
+				_add_tag(*list, s, p);
+		}
+		if (*s) // last item, the one not followed by a comma
+			_add_tag(*list, s, s + strlen(s));
+	} else {
+		mget_vector_free(list); // empty value drops the list; free via the slot, a local copy would leave it dangling
+	}
+
+	return 0;
+}
+
 static int parse_bool(option_t opt, G_GNUC_MGET_UNUSED const char *const *argv, const char *val)
 {
 	if (opt->var) {
@@ -548,6 +621,7 @@ static const struct option options[] = {
 	{ "egd-file", &config.egd_file, parse_string, 1, 0 },
 	{ "exclude-domains", &config.exclude_domains, parse_stringset, 1, 0 },
 	{ "execute", NULL, parse_execute, 1, 'e' },
+	{ "follow-tags", &config.follow_tags, parse_taglist, 1, 0 },
 	{ "force-atom", &config.force_atom, parse_bool, 0, 0 },
 	{ "force-css", &config.force_css, parse_bool, 0, 0 },
 	{ "force-directories", &config.force_directories, parse_bool, 0, 'x' },
@@ -567,6 +641,7 @@ static const struct option options[] = {
 	{ "https-only", &config.https_only, parse_bool, 0, 0 },
 	{ "https-proxy", &config.https_proxy, parse_string, 1, 0 },
 	{ "ignore-case", &config.ignore_case, parse_bool, 0, 0 },
+	{ "ignore-tags", &config.ignore_tags, parse_taglist, 1, 0 },
 	{ "inet4-only", &config.inet4_only, parse_bool, 0, '4' },
 	{ "inet6-only", &config.inet6_only, parse_bool, 0, '6' },
 	{ "input-encoding", &config.input_encoding, parse_string, 1, 0 },
@@ -1278,6 +1353,8 @@ void deinit(void)
 
 	mget_stringmap_free(&config.domains);
 	mget_stringmap_free(&config.exclude_domains);
+	mget_vector_free(&config.follow_tags);
+	mget_vector_free(&config.ignore_tags);
 
 	mget_http_set_http_proxy(NULL, NULL);
 	mget_http_set_https_proxy(NULL, NULL);
diff --git a/src/options.h b/src/options.h
index 44b393c6..53966363 100644
--- a/src/options.h
+++ b/src/options.h
@@ -80,7 +80,9 @@ struct config {
 		*exclude_domains;
 	mget_vector_t
 		*accept_patterns,
-		*reject_patterns;
+		*reject_patterns,
+		*follow_tags,
+		*ignore_tags;
 	mget_hsts_db_t
 		*hsts_db; // in-memory HSTS database
 	size_t
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e84fb72e..ce83eb20 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -9,7 +9,7 @@ MGET_TESTS = test test-mget-1 test-restrict-ascii test-i-http test-i-https test-
  test-meta-robots test-idn-robots test-idn-meta test-idn-cmd \
  test-iri test-iri-percent test-iri-list test-iri-forced-remote \
  test-auth-basic test-parse-html test-parse-rss test--page-requisites test--accept \
- test-k
+ test-k test--follow-tags
 
 #test--post-file test-E-k
 
diff --git a/tests/test--accept.c b/tests/test--accept.c
index 7cfde01a..d659ed1a 100644
--- a/tests/test--accept.c
+++ b/tests/test--accept.c
@@ -93,7 +93,7 @@ int main(void)
 
 	// --accept using just suffixes
 	mget_test(
-		MGET_TEST_OPTIONS, "--num-threads=1 -r -nH --accept '.jpeg'",
+		MGET_TEST_OPTIONS, "-r -nH --accept '.jpeg'",
 		MGET_TEST_REQUEST_URL, "index.html",
 		MGET_TEST_EXPECTED_ERROR_CODE, 0,
 		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
@@ -105,7 +105,7 @@ int main(void)
 
 	// --reject using just suffixes
 	mget_test(
-		MGET_TEST_OPTIONS, "--num-threads=1 -r -nH --reject '.jpeg'",
+		MGET_TEST_OPTIONS, "-r -nH --reject '.jpeg'",
 		MGET_TEST_REQUEST_URL, "index.html",
 		MGET_TEST_EXPECTED_ERROR_CODE, 0,
 		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
diff --git a/tests/test--follow-tags.c b/tests/test--follow-tags.c
new file mode 100644
index 00000000..22e54441
--- /dev/null
+++ b/tests/test--follow-tags.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright(c) 2013 Tim Ruehsen
+ *
+ * This file is part of libmget.
+ *
+ * Libmget is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Libmget is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libmget.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Testing Mget
+ *
+ * Changelog
+ * 09.09.2014  Tim Ruehsen  created
+ *
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdlib.h> // exit()
+#include "libtest.h"
+
+int main(void) // exercises --follow-tags and --ignore-tags in recursive downloads
+{
+	mget_test_url_t urls[]={ // pages and pictures handed to the test HTTP server
+		{	.name = "/index.html",
+			.code = "200 Dontcare",
+			.body =
+				"<html><head><title>Main Page</title></head><body><p>A link to a" \
+				" <a href=\"secondpage.html\">second page</a>." \
+				"</p></body></html>",
+			.headers = {
+				"Content-Type: text/html",
+			}
+		},
+		{	.name = "/secondpage.html",
+			.code = "200 Dontcare",
+			.body =
+				"<html><head><title>Second Page</title></head><body><p>A link to a" \
+				" <a href=\"2a.jpeg\">Picture 2a</a>." \
+				" <img src=\"2b.jpeg\" data-500px=\"2c.jpeg\" data-highres=\"2d.jpeg\">" \
+				"</p></body></html>",
+			.headers = {
+				"Content-Type: text/html",
+			}
+		},
+		{	.name = "/2a.jpeg",
+			.code = "200 Dontcare",
+			.body = "pic 2a",
+			.headers = { "Content-Type: image/jpeg" }
+		},
+		{	.name = "/2b.jpeg",
+			.code = "200 Dontcare",
+			.body = "pic 2b",
+			.headers = { "Content-Type: image/jpeg" }
+		},
+		{	.name = "/2c.jpeg",
+			.code = "200 Dontcare",
+			.body = "pic 2c",
+			.headers = { "Content-Type: image/jpeg" }
+		},
+		{	.name = "/2d.jpeg",
+			.code = "200 Dontcare",
+			.body = "pic 2d", // distinct body per picture, so mixed-up content would be noticed
+			.headers = { "Content-Type: image/jpeg" }
+		},
+	};
+
+	// functions won't come back if an error occurs
+	mget_test_start_http_server(
+		MGET_TEST_RESPONSE_URLS, &urls, countof(urls),
+		0);
+
+	// without additional tags: only the href and src links (2a, 2b) are expected
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH ",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[3].name + 1, urls[3].body },
+			{	NULL } },
+		0);
+
+	// --follow-tags single entry
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-500px'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[3].name + 1, urls[3].body },
+			{ urls[4].name + 1, urls[4].body },
+			{	NULL } },
+		0);
+
+	// --follow-tags single entry without attribute
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[3].name + 1, urls[3].body },
+			{ urls[4].name + 1, urls[4].body },
+			{ urls[5].name + 1, urls[5].body },
+			{	NULL } },
+		0);
+
+	// --follow-tags two entries
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-500px,img/data-highres'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[3].name + 1, urls[3].body },
+			{ urls[4].name + 1, urls[4].body },
+			{ urls[5].name + 1, urls[5].body },
+			{	NULL } },
+		0);
+
+	// --follow-tags two entries, reversed order
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --follow-tags 'img/data-highres,img/data-500px'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[3].name + 1, urls[3].body },
+			{ urls[4].name + 1, urls[4].body },
+			{ urls[5].name + 1, urls[5].body },
+			{	NULL } },
+		0);
+
+	// --ignore-tags single entry
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{	NULL } },
+		0);
+
+	// --ignore-tags single entry without attribute
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{	NULL } },
+		0);
+
+	// --ignore-tags two entries
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src,a/href'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{	NULL } },
+		0);
+
+	// --ignore-tags two entries, reversed order
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'a/href,img/src'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{	NULL } },
+		0);
+
+	// --ignore-tags and --follow-tags combined
+	mget_test(
+		MGET_TEST_OPTIONS, "-r -nH --ignore-tags 'img/src' --follow-tags='img/data-500px'",
+		MGET_TEST_REQUEST_URL, "index.html",
+		MGET_TEST_EXPECTED_ERROR_CODE, 0,
+		MGET_TEST_EXPECTED_FILES, &(mget_test_file_t []) {
+			{ urls[0].name + 1, urls[0].body },
+			{ urls[1].name + 1, urls[1].body },
+			{ urls[2].name + 1, urls[2].body },
+			{ urls[4].name + 1, urls[4].body },
+			{	NULL } },
+		0);
+
+	exit(0);
+}