Support parsing of UTF-16 HTML documents

* examples/print_html_urls.c (html_parse_localfile):
  Detect BOM (Byte Order Mark), convert UTF-16 to UTF-8 before parsing.
* include/libwget.h.in: Remove wget_charset_transcode(),
  add wget_memiconv() and wget_striconv().
* libwget/encoding.c: Implement wget_memiconv() and wget_striconv()
* src/wget.c (html_parse): Add 'html_len' param,
  convert UTF-16 to UTF-8 before parsing.
* tests/test.c: New test for wget_memiconv().
This commit is contained in:
Tim Rühsen
2016-04-18 21:47:43 +02:00
parent 51053c2b4e
commit 483c304aef
5 changed files with 174 additions and 43 deletions

View File

@ -38,12 +38,57 @@
static void html_parse_localfile(const char *fname)
{
char *data;
const char *encoding = NULL;
size_t len;
if ((data = wget_read_file(fname, &len))) {
if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
// Big-endian UTF-16
encoding = "UTF-16BE";
// adjust behind BOM, ignore trailing single byte
data += 2;
len -= 2;
} else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
// Little-endian UTF-16
encoding = "UTF-16LE";
// adjust behind BOM
data += 2;
len -= 2;
} else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
// UTF-8
encoding = "UTF-8";
// adjust behind BOM
data += 3;
len -= 3;
}
if (encoding)
printf("URI encoding '%s' set by BOM\n", encoding);
if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
size_t n;
char *utf8;
len -= len & 1; // ignore single trailing byte, else charset conversion fails
if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
data = utf8;
} else {
printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
return;
}
}
if ((data = wget_read_file(fname, NULL))) {
WGET_HTML_PARSED_RESULT *res = wget_html_get_urls_inline(data, NULL, NULL);
if (res->encoding)
printf("URI encoding '%s'\n", res->encoding);
if (encoding) {
if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
printf("Encoding '%s' as stated in document has been ignored\n", encoding);
}
for (int it = 0; it < wget_vector_size(res->uris); it++) {
WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it);

View File

@ -330,8 +330,10 @@ int
int (*load_func)(void *, FILE *fp), int (*save_func)(void *, FILE *fp), void *context) LIBWGET_EXPORT;
const char
*wget_local_charset_encoding(void) LIBWGET_EXPORT;
int
wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen) LIBWGET_EXPORT;
char *
wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
int
wget_str_needs_encoding(const char *s) G_GNUC_WGET_NONNULL((1)) G_GNUC_WGET_PURE LIBWGET_EXPORT;
int

View File

@ -66,10 +66,11 @@ const char *wget_local_charset_encoding(void)
return strdup("ASCII");
}
char *wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding)
// void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding)
int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen)
{
if (!src)
return NULL;
return -1;
#ifdef HAVE_ICONV
if (!src_encoding)
@ -77,35 +78,64 @@ char *wget_charset_transcode(const char *src, const char *src_encoding, const ch
if (!dst_encoding)
dst_encoding = "iso-8859-1"; // default character-set for most browsers
if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
char *ret = NULL;
int ret = -1;
iconv_t cd=iconv_open(dst_encoding, src_encoding);
if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
iconv_t cd = iconv_open(dst_encoding, src_encoding);
if (cd != (iconv_t)-1) {
char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself
size_t tmp_len = strlen(src);
size_t tmp_len = srclen;
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
char *dst = xmalloc(dst_len + 1), *dst_tmp = dst;
if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
{
ret = wget_strmemdup(dst, dst_len - dst_len_tmp);
debug_printf("converted '%s' (%s) -> '%s' (%s)\n", src, src_encoding, ret, dst_encoding);
} else
error_printf(_("Failed to convert '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
if (out) {
*out = xrealloc(dst, dst_len - dst_len_tmp + 1);
(*out)[dst_len - dst_len_tmp] = 0;
}
if (outlen)
*outlen = dst_len - dst_len_tmp;
ret = 0; // return OK
} else {
error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
xfree(dst);
if (out)
*out = NULL;
if (outlen)
*outlen = 0;
}
xfree(dst);
iconv_close(cd);
} else
error_printf(_("Failed to prepare encoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
return ret;
}
#endif
return strdup(src);
if (out)
*out = wget_strmemdup(src, srclen);
if (outlen)
*outlen = srclen;
return 0;
}
/*
 * Transcode the NUL-terminated string 'src' from 'src_encoding' into
 * 'dst_encoding'.
 *
 * This is a thin convenience wrapper around wget_memiconv(): the input length
 * is taken with strlen(), so 'src' must be an ASCII-compatible C string
 * (no embedded NUL bytes, as e.g. UTF-16 text would have).
 *
 * Returns the buffer handed back by wget_memiconv() on success, or NULL when
 * 'src' is NULL or wget_memiconv() reports an error (non-zero return).
 */
char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
{
	char *converted;
	int rc;

	if (src == NULL)
		return NULL;

	// the output length is not needed here, hence the trailing NULL
	rc = wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &converted, NULL);

	return rc == 0 ? converted : NULL;
}
int wget_str_needs_encoding(const char *s)
@ -143,12 +173,12 @@ int wget_str_is_valid_utf8(const char *utf8)
char *wget_str_to_utf8(const char *src, const char *encoding)
{
return wget_charset_transcode(src, encoding, "utf-8");
return wget_striconv(src, encoding, "utf-8");
}
char *wget_utf8_to_str(const char *src, const char *encoding)
{
return wget_charset_transcode(src, "utf-8", encoding);
return wget_striconv(src, "utf-8", encoding);
}
#ifdef WITH_LIBIDN

View File

@ -127,7 +127,7 @@ static void
rss_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
rss_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base),
metalink_parse_localfile(const char *fname),
html_parse(JOB *job, int level, const char *data, const char *encoding, wget_iri_t *base),
html_parse(JOB *job, int level, const char *data, size_t len, const char *encoding, wget_iri_t *base),
html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base),
css_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
css_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base);
@ -1383,9 +1383,9 @@ void *downloader_thread(void *p)
if (config.recursive && (!config.level || job->level < config.level + config.page_requisites)) {
if (resp->content_type) {
if (!wget_strcasecmp_ascii(resp->content_type, "text/html")) {
html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
} else if (!wget_strcasecmp_ascii(resp->content_type, "application/xhtml+xml")) {
html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
// xml_parse(sockfd, resp, job->iri);
} else if (!wget_strcasecmp_ascii(resp->content_type, "text/css")) {
css_parse(job, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
@ -1500,16 +1500,14 @@ static unsigned int G_GNUC_WGET_PURE hash_url(const char *url)
return hash;
}
void html_parse(JOB *job, int level, const char *html, const char *encoding, wget_iri_t *base)
void html_parse(JOB *job, int level, const char *html, size_t html_len, const char *encoding, wget_iri_t *base)
{
WGET_HTML_PARSED_RESULT *parsed = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
wget_iri_t *allocated_base = NULL;
const char *reason;
char *utf8 = NULL;
wget_buffer_t buf;
char sbuf[1024];
if (config.robots && !parsed->follow)
goto cleanup;
int convert_links = config.convert_links && !config.delete_after;
// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
if (encoding && encoding == config.remote_encoding) {
@ -1519,33 +1517,62 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
// Big-endian UTF-16
encoding = "UTF-16BE";
reason = _("set by BOM");
// adjust behind BOM, ignore trailing single byte
html += 2;
html_len -= 2;
} else if ((unsigned char)html[0] == 0xFF && (unsigned char)html[1] == 0xFE) {
// Little-endian UTF-16
encoding = "UTF-16LE";
reason = _("set by BOM");
// adjust behind BOM
html += 2;
html_len -= 2;
} else if ((unsigned char)html[0] == 0xEF && (unsigned char)html[1] == 0xBB && (unsigned char)html[2] == 0xBF) {
// UTF-8
encoding = "UTF-8";
reason = _("set by BOM");
// adjust behind BOM
html += 3;
html_len -= 3;
} else {
reason = _("set by server response");
}
}
if (!wget_strncasecmp(parsed->encoding, "UTF-16", 6) || !wget_strncasecmp(encoding, "UTF-16", 6)) {
// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
// we found an encoding in the HTML, so it can't be UTF-16*.
encoding = "UTF-8";
reason = _("wrong stated UTF-16* changed to UTF-8");
}
if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
size_t n;
if (!encoding) {
if (parsed->encoding) {
encoding = parsed->encoding;
reason = _("set by document");
} else {
encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
reason = _("default, encoding not specified");
html_len -= html_len & 1; // ignore single trailing byte, else charset conversion fails
if (wget_memiconv(encoding, html, html_len, "UTF-8", &utf8, &n) == 0) {
info_printf(_("Convert non-ASCII encoding '%s' (%s) to UTF-8\n"), encoding, reason);
html = utf8;
if (convert_links) {
convert_links = 0; // prevent link conversion
info_printf(_("Link conversion disabled for '%s'\n"), job->local_filename);
}
} else {
info_printf(_("Failed to convert non-ASCII encoding '%s' (%s) to UTF-8, skip parsing\n"), encoding, reason);
return;
}
}
WGET_HTML_PARSED_RESULT *parsed = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
if (config.robots && !parsed->follow)
goto cleanup;
if (!encoding) {
if (parsed->encoding) {
encoding = parsed->encoding;
reason = _("set by document");
} else {
encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
reason = _("default, encoding not specified");
}
}
@ -1609,7 +1636,7 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
wget_buffer_deinit(&buf);
if (config.convert_links && !config.delete_after) {
if (convert_links && !config.delete_after) {
for (int it = 0; it < wget_vector_size(parsed->uris); it++) {
WGET_HTML_PARSED_URL *html_url = wget_vector_get(parsed->uris, it);
html_url->url.p = (const char *) (html_url->url.p - html); // convert pointer to offset
@ -1622,14 +1649,17 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
cleanup:
wget_html_free_urls_inline(&parsed);
xfree(utf8);
}
void html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base)
{
char *data;
size_t n;
if ((data = wget_read_file(fname, NULL)))
html_parse(job, level, data, encoding, base);
if ((data = wget_read_file(fname, &n))) {
html_parse(job, level, data, n, encoding, base);
}
xfree(data);
}

View File

@ -1497,6 +1497,29 @@ static void test_stringmap(void)
}
/*
 * Round-trip test for wget_memiconv():
 *   UTF-8 -> UTF-16BE -> UTF-16LE -> UTF-8
 * The final result must be byte-identical to the original UTF-8 string.
 * Increments the global 'ok' counter on success and 'failed' otherwise.
 */
static void test_striconv(void)
{
	const char *utf8 = "abcßüäö";
	// Start out as NULL: if an early conversion fails, the '||' chain below
	// short-circuits and the later buffers are never assigned. Without the
	// initialization the error message and the xfree() calls would operate
	// on indeterminate pointers.
	char *utf16be = NULL, *utf16le = NULL, *result = NULL;
	size_t n;

	// convert utf-8 to utf-16be, then to utf-16le, then back to utf-8
	if (wget_memiconv("utf-8", utf8, strlen(utf8), "UTF-16BE", &utf16be, &n) ||
		wget_memiconv("UTF-16BE", utf16be, n, "UTF-16LE", &utf16le, &n) ||
		wget_memiconv("UTF-16LE", utf16le, n, "UTF-8", &result, &n) ||
		!result || strcmp(utf8, result))
	{
		// never hand a NULL pointer to a '%s' conversion
		info_printf("Character conversion of '%s' failed (got '%s')\n", utf8, result ? result : "(null)");
		failed++;
	} else {
		ok++;
	}

	xfree(result);
	xfree(utf16le);
	xfree(utf16be);
}
int main(int argc, const char **argv)
{
// if VALGRIND testing is enabled, we have to call ourselves with valgrind checking
@ -1530,6 +1553,7 @@ int main(int argc, const char **argv)
test_hashing();
test_vector();
test_stringmap();
test_striconv();
if (failed) {
info_printf("ERROR: %d out of %d basic tests failed\n", failed, ok + failed);