Support parsing of UTF-16 HTML documents

* examples/print_html_urls.c (html_parse_localfile): Detect BOM (Byte Order Mark), convert UTF-16 to UTF-8 before parsing.
* include/libwget.h.in: Remove wget_charset_transcode(), add wget_memiconv() and wget_striconv().
* libwget/encoding.c: Implement wget_memiconv() and wget_striconv().
* src/wget.c (html_parse): Add 'html_len' param, convert UTF-16 to UTF-8 before parsing.
* tests/test.c: New test for wget_memiconv().
2025-08-16 17:37:56 +00:00 · 2016-04-18 21:47:43 +02:00
parent 51053c2b4e
commit 483c304aef
5 changed files with 174 additions and 43 deletions
--- a/examples/print_html_urls.c
+++ b/examples/print_html_urls.c
@ -38,12 +38,57 @@
 static void html_parse_localfile(const char *fname)
 {
 	char *data;
+	const char *encoding = NULL;
+	size_t len;
+
+	if ((data = wget_read_file(fname, &len))) {
+		if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
+			// Big-endian UTF-16
+			encoding = "UTF-16BE";
+
+			// adjust behind BOM (an odd trailing byte is dropped later, just before the conversion)
+			data += 2;
+			len -= 2;
+		} else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
+			// Little-endian UTF-16
+			encoding = "UTF-16LE";
+
+			// adjust behind BOM
+			data += 2;
+			len -= 2;
+		} else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
+			// UTF-8
+			encoding = "UTF-8";
+
+			// adjust behind BOM
+			data += 3;
+			len -= 3;
+		}
+
+		if (encoding)
+			printf("URI encoding '%s' set by BOM\n", encoding);
+
+		if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
+			size_t n;
+			char *utf8;
+
+			len -= len & 1; // ignore single trailing byte, else charset conversion fails
+
+			if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
+				printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
+				data = utf8;
+			} else {
+				printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
+				return;
+			}
+		}

-	if ((data = wget_read_file(fname, NULL))) {
 		WGET_HTML_PARSED_RESULT *res  = wget_html_get_urls_inline(data, NULL, NULL);

-		if (res->encoding)
-			printf("URI encoding '%s'\n", res->encoding);
+		if (encoding) {
+			if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
+				printf("Encoding '%s' as stated in document has been ignored\n", encoding);
+		}

 		for (int it = 0; it < wget_vector_size(res->uris); it++) {
 			WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it);
--- a/include/libwget.h.in
+++ b/include/libwget.h.in
@ -330,8 +330,10 @@ int
 		int (*load_func)(void *, FILE *fp), int (*save_func)(void *, FILE *fp), void *context) LIBWGET_EXPORT;
 const char
 	*wget_local_charset_encoding(void) LIBWGET_EXPORT;
+int
+	wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen) LIBWGET_EXPORT;
 char *
-	wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
+	wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
 int
 	wget_str_needs_encoding(const char *s) G_GNUC_WGET_NONNULL((1)) G_GNUC_WGET_PURE LIBWGET_EXPORT;
 int
--- a/libwget/encoding.c
+++ b/libwget/encoding.c
@ -66,10 +66,11 @@ const char *wget_local_charset_encoding(void)
 	return strdup("ASCII");
 }

-char *wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding)
+// void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding)
+int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen)
 {
 	if (!src)
-		return NULL;
+		return -1;

 #ifdef HAVE_ICONV
 	if (!src_encoding)
@ -77,35 +78,64 @@ char *wget_charset_transcode(const char *src, const char *src_encoding, const ch
 	if (!dst_encoding)
 		dst_encoding = "iso-8859-1"; // default character-set for most browsers

-	if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
-		char *ret = NULL;
+	int ret = -1;

-		iconv_t cd=iconv_open(dst_encoding, src_encoding);
+	if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
+		iconv_t cd = iconv_open(dst_encoding, src_encoding);

 		if (cd != (iconv_t)-1) {
 			char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself
-			size_t tmp_len = strlen(src);
+			size_t tmp_len = srclen;
 			size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
 			char *dst = xmalloc(dst_len + 1), *dst_tmp = dst;

 			if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
 				&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
 			{
-				ret = wget_strmemdup(dst, dst_len - dst_len_tmp);
-				debug_printf("converted '%s' (%s) -> '%s' (%s)\n", src, src_encoding, ret, dst_encoding);
-			} else
-				error_printf(_("Failed to convert '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
+				debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
+				if (out) {
+					*out = xrealloc(dst, dst_len - dst_len_tmp + 1);
+					(*out)[dst_len - dst_len_tmp] = 0;
+				}
+				if (outlen)
+					*outlen = dst_len - dst_len_tmp;
+				ret = 0; // return OK
+			} else {
+				error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
+				xfree(dst);
+				if (out)
+					*out = NULL;
+				if (outlen)
+					*outlen = 0;
+			}

-			xfree(dst);
 			iconv_close(cd);
 		} else
-			error_printf(_("Failed to prepare encoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
+			error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);

 		return ret;
 	}
 #endif

-	return strdup(src);
+	if (out)
+		*out = wget_strmemdup(src, srclen);
+	if (outlen)
+		*outlen = srclen;
+
+	return 0;
+}
+
+// Transcode the C string 'src' from 'src_encoding' to 'dst_encoding' via wget_memiconv(); 'src' must be an ASCII-compatible (NUL-terminated) C string because its length is taken with strlen(). Returns the converted string, or NULL if 'src' is NULL or the conversion fails.
+char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
+{
+	if (!src)
+		return NULL;
+
+	char *dst; // assigned by wget_memiconv() through its 'out' parameter when it returns 0
+	if (wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &dst, NULL)) // NULL: the output length is not needed here
+		return NULL; // non-zero return means the transcoding failed
+
+	return dst;
+}

 int wget_str_needs_encoding(const char *s)
@ -143,12 +173,12 @@ int wget_str_is_valid_utf8(const char *utf8)

 char *wget_str_to_utf8(const char *src, const char *encoding)
 {
-	return wget_charset_transcode(src, encoding, "utf-8");
+	return wget_striconv(src, encoding, "utf-8");
 }

 char *wget_utf8_to_str(const char *src, const char *encoding)
 {
-	return wget_charset_transcode(src, "utf-8", encoding);
+	return wget_striconv(src, "utf-8", encoding);
 }

 #ifdef WITH_LIBIDN
--- a/src/wget.c
+++ b/src/wget.c
@ -127,7 +127,7 @@ static void
 	rss_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
 	rss_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base),
 	metalink_parse_localfile(const char *fname),
-	html_parse(JOB *job, int level, const char *data, const char *encoding, wget_iri_t *base),
+	html_parse(JOB *job, int level, const char *data, size_t len, const char *encoding, wget_iri_t *base),
 	html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base),
 	css_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
 	css_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base);
@ -1383,9 +1383,9 @@ void *downloader_thread(void *p)
 			if (config.recursive && (!config.level || job->level < config.level + config.page_requisites)) {
 				if (resp->content_type) {
 					if (!wget_strcasecmp_ascii(resp->content_type, "text/html")) {
-						html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
+						html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
 					} else if (!wget_strcasecmp_ascii(resp->content_type, "application/xhtml+xml")) {
-						html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
+						html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
 						// xml_parse(sockfd, resp, job->iri);
 					} else if (!wget_strcasecmp_ascii(resp->content_type, "text/css")) {
 						css_parse(job, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
@ -1500,16 +1500,14 @@ static unsigned int G_GNUC_WGET_PURE hash_url(const char *url)
 	return hash;
 }

-void html_parse(JOB *job, int level, const char *html, const char *encoding, wget_iri_t *base)
+void html_parse(JOB *job, int level, const char *html, size_t html_len, const char *encoding, wget_iri_t *base)
 {
-	WGET_HTML_PARSED_RESULT *parsed  = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
 	wget_iri_t *allocated_base = NULL;
 	const char *reason;
+	char *utf8 = NULL;
 	wget_buffer_t buf;
 	char sbuf[1024];
-
-	if (config.robots && !parsed->follow)
-		goto cleanup;
+	int convert_links = config.convert_links && !config.delete_after;

 	// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
 	if (encoding && encoding == config.remote_encoding) {
@ -1519,33 +1517,62 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
 			// Big-endian UTF-16
 			encoding = "UTF-16BE";
 			reason = _("set by BOM");
+
+			// adjust behind BOM (an odd trailing byte is dropped later, just before the conversion)
+			html += 2;
+			html_len -= 2;
 		} else if ((unsigned char)html[0] == 0xFF && (unsigned char)html[1] == 0xFE) {
 			// Little-endian UTF-16
 			encoding = "UTF-16LE";
 			reason = _("set by BOM");
+
+			// adjust behind BOM
+			html += 2;
+			html_len -= 2;
 		} else if ((unsigned char)html[0] == 0xEF && (unsigned char)html[1] == 0xBB && (unsigned char)html[2] == 0xBF) {
 			// UTF-8
 			encoding = "UTF-8";
 			reason = _("set by BOM");
+
+			// adjust behind BOM
+			html += 3;
+			html_len -= 3;
 		} else {
 			reason = _("set by server response");
 		}
+	}

-		if (!wget_strncasecmp(parsed->encoding, "UTF-16", 6) || !wget_strncasecmp(encoding, "UTF-16", 6)) {
-			// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
-			// we found an encoding in the HTML, so it can't be UTF-16*.
-			encoding = "UTF-8";
-			reason = _("wrong stated UTF-16* changed to UTF-8");
-		}
+	if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
+		size_t n;

-		if (!encoding) {
-			if (parsed->encoding) {
-				encoding = parsed->encoding;
-				reason = _("set by document");
-			} else {
-				encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
-				reason = _("default, encoding not specified");
+		html_len -= html_len & 1; // ignore single trailing byte, else charset conversion fails
+
+		if (wget_memiconv(encoding, html, html_len, "UTF-8", &utf8, &n) == 0) {
+			info_printf(_("Convert non-ASCII encoding '%s' (%s) to UTF-8\n"), encoding, reason);
+			html = utf8;
+			if (convert_links) {
+				convert_links = 0; // prevent link conversion
+				info_printf(_("Link conversion disabled for '%s'\n"), job->local_filename);
 			}
+
+		} else {
+			info_printf(_("Failed to convert non-ASCII encoding '%s' (%s) to UTF-8, skip parsing\n"), encoding, reason);
+			return;
+		}
+	}
+
+	WGET_HTML_PARSED_RESULT *parsed  = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
+
+	if (config.robots && !parsed->follow)
+		goto cleanup;
+
+	if (!encoding) {
+		if (parsed->encoding) {
+			encoding = parsed->encoding;
+			reason = _("set by document");
+		} else {
+			encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
+			reason = _("default, encoding not specified");
 		}
 	}

@ -1609,7 +1636,7 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge

 	wget_buffer_deinit(&buf);

-	if (config.convert_links && !config.delete_after) {
+	if (convert_links && !config.delete_after) {
 		for (int it = 0; it < wget_vector_size(parsed->uris); it++) {
 			WGET_HTML_PARSED_URL *html_url = wget_vector_get(parsed->uris, it);
 			html_url->url.p = (const char *) (html_url->url.p - html); // convert pointer to offset
@ -1622,14 +1649,17 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge

 cleanup:
 	wget_html_free_urls_inline(&parsed);
+	xfree(utf8);
 }

 void html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base)
 {
 	char *data;
+	size_t n;

-	if ((data = wget_read_file(fname, NULL)))
-		html_parse(job, level, data, encoding, base);
+	if ((data = wget_read_file(fname, &n))) {
+		html_parse(job, level, data, n, encoding, base);
+	}

 	xfree(data);
 }
--- a/tests/test.c
+++ b/tests/test.c
@ -1497,6 +1497,29 @@ static void test_stringmap(void)

 }

+static void test_striconv(void)
+{
+	const char *utf8 = "abcßüäö";
+	char *utf16be = NULL, *utf16le = NULL, *result = NULL;
+	size_t n;
+	// Pointers start as NULL: '||' short-circuits on the first failure, so later buffers may never be assigned.
+	// Round trip UTF-8 -> UTF-16BE -> UTF-16LE -> UTF-8; 'result' must equal the input.
+	if (wget_memiconv("utf-8", utf8, strlen(utf8), "UTF-16BE", &utf16be, &n) ||
+		wget_memiconv("UTF-16BE", utf16be, n, "UTF-16LE", &utf16le, &n) ||
+		wget_memiconv("UTF-16LE", utf16le, n, "UTF-8", &result, &n) ||
+		!result || strcmp(utf8, result))
+	{
+		info_printf("Character conversion of '%s' failed (got '%s')\n", utf8, result ? result : "(null)");
+		failed++;
+	} else {
+		ok++;
+	}
+
+	xfree(result);
+	xfree(utf16le);
+	xfree(utf16be);
+}
+
 int main(int argc, const char **argv)
 {
 	// if VALGRIND testing is enabled, we have to call ourselves with valgrind checking
@ -1530,6 +1553,7 @@ int main(int argc, const char **argv)
 	test_hashing();
 	test_vector();
 	test_stringmap();
+	test_striconv();

 	if (failed) {
 		info_printf("ERROR: %d out of %d basic tests failed\n", failed, ok + failed);