wget2/libwget/html_url.c

/*
 * Copyright(c) 2013 Tim Ruehsen
 * Copyright(c) 2015-2018 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * Extracting URLs from HTML
 *
 * Changelog
 * 26.09.2013  Tim Ruehsen  created
 *
 */

#include <config.h>

#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <c-ctype.h>

#include <wget.h>
#include "private.h"

typedef struct {
	wget_html_parsed_result_t
		result;
	wget_vector_t *
		additional_tags;
	wget_vector_t *
		ignore_tags;
	int
		uri_index;
	size_t
		css_start_offset;
	char
		found_robots,
		found_content_type,
		link_inline;
	const char
		* html,
		* css_attr,
		* css_dir;
} _html_context_t;

// see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
static const char maybe[256] = {
	['a'] = 1,
	['b'] = 1,
	['c'] = 1,
	['d'] = 1,
	['f'] = 1,
	['h'] = 1,
	['i'] = 1,
	['l'] = 1,
	['m'] = 1,
	['p'] = 1,
	['s'] = 1,
	['u'] = 1,
};
static const char attrs[][12] = {
	"action", "archive",
	"background",
	"code", "codebase", "cite", "classid",
	"data",
	"formaction",
	"href",
	"icon",
	"lowsrc", "longdesc",
	"manifest",
	"profile", "poster",
	"src", "srcset",
	"usemap"
};

static void _css_parse_uri(void *context, const char *url G_GNUC_WGET_UNUSED, size_t len, size_t pos)
{
	_html_context_t *ctx = context;

	wget_html_parsed_result_t *res = &ctx->result;

	if (!res->uris)
		res->uris = wget_vector_create(32, NULL);

	wget_html_parsed_url_t parsed_url;
	parsed_url.link_inline = 1;
	wget_strscpy(parsed_url.attr, ctx->css_attr, sizeof(parsed_url.attr));
	wget_strscpy(parsed_url.dir, ctx->css_dir, sizeof(parsed_url.dir));
	parsed_url.url.p = (const char *) (ctx->html + ctx->css_start_offset + pos);
	parsed_url.url.len = len;

	wget_vector_add(res->uris, &parsed_url, sizeof(parsed_url));
}

// Callback function, called from HTML parser for each URI found.
static void _html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	_html_context_t *ctx = context;

	// Read the encoding from META tag, e.g. from
	//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">.
	// It overrides the encoding from the HTTP response resp. from the CLI.
	//
	// Also ,we are interested in ROBOTS e.g.
	//   <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
	if ((flags & XML_FLG_BEGIN)) {
		if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta"))
			ctx->found_robots = ctx->found_content_type = 0;
		else if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
			ctx->link_inline = 0;
			ctx->uri_index = -1;
		}
	}

	if ((flags & XML_FLG_ATTRIBUTE) && val) {
		wget_html_parsed_result_t *res = &ctx->result;

//		debug_printf("%02X %s %s '%.*s' %zu %zu\n", (unsigned) flags, tag, attr, (int) len, val, len, pos);

		if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
			if (!ctx->found_robots) {
				if (!wget_strcasecmp_ascii(attr, "name") && !wget_strncasecmp_ascii(val, "robots", len)) {
					ctx->found_robots = 1;
					return;
				}
			} else if (ctx->found_robots && !wget_strcasecmp_ascii(attr, "content")) {
				char *p;
				char valbuf[len + 1], *value = valbuf;

				memcpy(value, val, len);
				value[len] = 0;

				while (*value) {
					while (c_isspace(*value)) value++;
					if (*value == ',') { value++; continue; }
					for (p = value; *p && !c_isspace(*p) && *p != ','; p++);
					if (p == value) break;

					// debug_printf("ROBOTS='%.*s'\n", (int)(p - value), value);
					if (!wget_strncasecmp_ascii(value, "all", p - value) || !wget_strncasecmp_ascii(value, "follow", p - value))
						res->follow = 1;
					else if (!wget_strncasecmp_ascii(value, "nofollow", p - value) || !wget_strncasecmp_ascii(value, "none", p - value))
						res->follow = 0;

					value = *p  ? p + 1 : p;
				}
				return;
			}

			if (ctx->found_content_type && !res->encoding) {
				if (!wget_strcasecmp_ascii(attr, "content")) {
					char valbuf[len + 1], *value = valbuf;

					memcpy(value, val, len);
					value[len] = 0;
					wget_http_parse_content_type(value, NULL, &res->encoding);
				}
			}
			else if (!ctx->found_content_type && !res->encoding) {
				if (!wget_strcasecmp_ascii(attr, "http-equiv") && !wget_strncasecmp_ascii(val, "Content-Type", len)) {
					ctx->found_content_type = 1;
				}
				else if (!wget_strcasecmp_ascii(attr, "charset")) {
					res->encoding = wget_strmemdup(val, len);
				}
			}

			return;
		}

		if (ctx->ignore_tags) {
			if (wget_vector_find(ctx->ignore_tags, &(wget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1
				|| wget_vector_find(ctx->ignore_tags, &(wget_html_tag_t){ .name = tag, .attribute = attr } ) != -1)
				return;
		}

		if ((*attr|0x20) == 's' && !wget_strcasecmp_ascii(attr, "style") && len) {
			ctx->css_dir = tag;
			ctx->css_attr = "style";
			ctx->css_start_offset = val - ctx->html;
			wget_css_parse_buffer(val, len, _css_parse_uri, NULL, context);
			return;
		}

		if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
			if (!wget_strcasecmp_ascii(attr, "rel")) {
				if (!wget_strncasecmp_ascii(val, "shortcut icon", len)
					|| !wget_strncasecmp_ascii(val, "stylesheet", len)
					|| !wget_strncasecmp_ascii(val, "preload", len))
					ctx->link_inline = 1;
				else
					ctx->link_inline = 0;

				if (ctx->uri_index >= 0) {
					// href= came before rel=
					wget_html_parsed_url_t *url = wget_vector_get(res->uris, ctx->uri_index);
					url->link_inline = ctx->link_inline;
				}
			}
		}

		// shortcut to avoid unneeded calls to bsearch()
		int found = 0;

		// search the static list for a tag/attr match
		if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2])
			found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))wget_strcasecmp_ascii) != NULL;

		// search the dynamic list for a tag/attr match
		if (!found && ctx->additional_tags) {
			if (wget_vector_find(ctx->additional_tags, &(wget_html_tag_t){ .name = tag, .attribute = NULL } ) != -1
				|| wget_vector_find(ctx->additional_tags, &(wget_html_tag_t){ .name = tag, .attribute = attr } ) != -1)
				found = 1;
		}

		if (found) {
			for (;len && c_isspace(*val); val++, len--); // skip leading spaces
			for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces

			if ((*tag|0x20) == 'b' && !wget_strcasecmp_ascii(tag, "base")) {
				// found a <BASE href="...">
				res->base.p = val;
				res->base.len = len;
				return;
			}

			if (!res->uris)
				res->uris = wget_vector_create(32, NULL);

			wget_html_parsed_url_t url;

			if (!wget_strcasecmp_ascii(attr, "srcset")) {
				// value is a list of URLs, see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
				while (len) {
					const char *p;

					for (;len && c_isspace(*val); val++, len--); // skip leading spaces
					for (p = val;len && !c_isspace(*val) && *val != ','; val++, len--); // find end of URL
					if (p != val) {
						wget_strscpy(url.attr, attr, sizeof(url.attr));
						wget_strscpy(url.dir, tag, sizeof(url.dir));
						url.url.p = p;
						url.url.len = val - p;
						wget_vector_add(res->uris, &url, sizeof(url));
					}
					for (;len && *val != ','; val++, len--); // skip optional width/density descriptor
					if (len && *val == ',') { val++; len--; }
				}

			} else {
				// value is a single URL
				url.link_inline = ctx->link_inline;
				wget_strscpy(url.attr, attr, sizeof(url.attr));
				wget_strscpy(url.dir, tag, sizeof(url.dir));
				url.url.p = val;
				url.url.len = len;
				ctx->uri_index = wget_vector_add(res->uris, &url, sizeof(url));
			}
		}
	}

	if (flags & XML_FLG_CONTENT && val && len && !wget_strcasecmp_ascii(tag, "style")) {
		ctx->css_dir = "style";
		ctx->css_attr = "";
		ctx->css_start_offset = val - ctx->html;
		wget_css_parse_buffer(val, len, _css_parse_uri, NULL, context);
	}
}

void wget_html_free_urls_inline (wget_html_parsed_result_t **res)
{
	if (res && *res) {
		xfree((*res)->encoding);
		wget_vector_free(&(*res)->uris);
		xfree(*res);
	}
}

wget_html_parsed_result_t *wget_html_get_urls_inline(const char *html, wget_vector_t *additional_tags, wget_vector_t *ignore_tags)
{
	_html_context_t context = {
		.result.follow = 1,
		.additional_tags = additional_tags,
		.ignore_tags = ignore_tags,
		.html = html,
	};

//	context.result.uris = wget_vector_create(32, -2, NULL);
	wget_html_parse_buffer(html, _html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);

	return wget_memdup(&context.result, sizeof(context.result));
}