mirror of
https://gitlab.com/gnuwget/wget2.git
synced 2025-08-16 17:37:56 +00:00
Support parsing of UTF-16 HTML documents
* examples/print_html_urls.c (html_parse_localfile): Detect BOM (Byte Order Mark), convert UTF-16 to UTF-8 before parsing.
* include/libwget.h.in: Remove wget_charset_transcode(), add wget_memiconv() and wget_striconv().
* libwget/encoding.c: Implement wget_memiconv() and wget_striconv().
* src/wget.c (html_parse): Add 'html_len' param, convert UTF-16 to UTF-8 before parsing.
* tests/test.c: New test for wget_memiconv().
This commit is contained in:
@ -38,12 +38,57 @@
|
||||
static void html_parse_localfile(const char *fname)
|
||||
{
|
||||
char *data;
|
||||
const char *encoding = NULL;
|
||||
size_t len;
|
||||
|
||||
if ((data = wget_read_file(fname, &len))) {
|
||||
if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
|
||||
// Big-endian UTF-16
|
||||
encoding = "UTF-16BE";
|
||||
|
||||
// adjust behind BOM, ignore trailing single byte
|
||||
data += 2;
|
||||
len -= 2;
|
||||
} else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
|
||||
// Little-endian UTF-16
|
||||
encoding = "UTF-16LE";
|
||||
|
||||
// adjust behind BOM
|
||||
data += 2;
|
||||
len -= 2;
|
||||
} else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
|
||||
// UTF-8
|
||||
encoding = "UTF-8";
|
||||
|
||||
// adjust behind BOM
|
||||
data += 3;
|
||||
len -= 3;
|
||||
}
|
||||
|
||||
if (encoding)
|
||||
printf("URI encoding '%s' set by BOM\n", encoding);
|
||||
|
||||
if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
|
||||
size_t n;
|
||||
char *utf8;
|
||||
|
||||
len -= len & 1; // ignore single trailing byte, else charset conversion fails
|
||||
|
||||
if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
|
||||
printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
|
||||
data = utf8;
|
||||
} else {
|
||||
printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if ((data = wget_read_file(fname, NULL))) {
|
||||
WGET_HTML_PARSED_RESULT *res = wget_html_get_urls_inline(data, NULL, NULL);
|
||||
|
||||
if (res->encoding)
|
||||
printf("URI encoding '%s'\n", res->encoding);
|
||||
if (encoding) {
|
||||
if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
|
||||
printf("Encoding '%s' as stated in document has been ignored\n", encoding);
|
||||
}
|
||||
|
||||
for (int it = 0; it < wget_vector_size(res->uris); it++) {
|
||||
WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it);
|
||||
|
@ -330,8 +330,10 @@ int
|
||||
int (*load_func)(void *, FILE *fp), int (*save_func)(void *, FILE *fp), void *context) LIBWGET_EXPORT;
|
||||
const char
|
||||
*wget_local_charset_encoding(void) LIBWGET_EXPORT;
|
||||
int
|
||||
wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen) LIBWGET_EXPORT;
|
||||
char *
|
||||
wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
|
||||
wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding) G_GNUC_WGET_MALLOC LIBWGET_EXPORT;
|
||||
int
|
||||
wget_str_needs_encoding(const char *s) G_GNUC_WGET_NONNULL((1)) G_GNUC_WGET_PURE LIBWGET_EXPORT;
|
||||
int
|
||||
|
@ -66,10 +66,11 @@ const char *wget_local_charset_encoding(void)
|
||||
return strdup("ASCII");
|
||||
}
|
||||
|
||||
char *wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding)
|
||||
// void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding)
|
||||
int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen)
|
||||
{
|
||||
if (!src)
|
||||
return NULL;
|
||||
return -1;
|
||||
|
||||
#ifdef HAVE_ICONV
|
||||
if (!src_encoding)
|
||||
@ -77,35 +78,64 @@ char *wget_charset_transcode(const char *src, const char *src_encoding, const ch
|
||||
if (!dst_encoding)
|
||||
dst_encoding = "iso-8859-1"; // default character-set for most browsers
|
||||
|
||||
if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
|
||||
char *ret = NULL;
|
||||
int ret = -1;
|
||||
|
||||
iconv_t cd=iconv_open(dst_encoding, src_encoding);
|
||||
if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
|
||||
iconv_t cd = iconv_open(dst_encoding, src_encoding);
|
||||
|
||||
if (cd != (iconv_t)-1) {
|
||||
char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself
|
||||
size_t tmp_len = strlen(src);
|
||||
size_t tmp_len = srclen;
|
||||
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
|
||||
char *dst = xmalloc(dst_len + 1), *dst_tmp = dst;
|
||||
|
||||
if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
|
||||
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
|
||||
{
|
||||
ret = wget_strmemdup(dst, dst_len - dst_len_tmp);
|
||||
debug_printf("converted '%s' (%s) -> '%s' (%s)\n", src, src_encoding, ret, dst_encoding);
|
||||
} else
|
||||
error_printf(_("Failed to convert '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
|
||||
debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
|
||||
if (out) {
|
||||
*out = xrealloc(dst, dst_len - dst_len_tmp + 1);
|
||||
(*out)[dst_len - dst_len_tmp] = 0;
|
||||
}
|
||||
if (outlen)
|
||||
*outlen = dst_len - dst_len_tmp;
|
||||
ret = 0; // return OK
|
||||
} else {
|
||||
error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
|
||||
xfree(dst);
|
||||
if (out)
|
||||
*out = NULL;
|
||||
if (outlen)
|
||||
*outlen = 0;
|
||||
}
|
||||
|
||||
xfree(dst);
|
||||
iconv_close(cd);
|
||||
} else
|
||||
error_printf(_("Failed to prepare encoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
|
||||
error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
return strdup(src);
|
||||
if (out)
|
||||
*out = wget_strmemdup(src, srclen);
|
||||
if (outlen)
|
||||
*outlen = srclen;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Transcode the C string 'src' from 'src_encoding' into 'dst_encoding'.
 *
 * The input length is taken with strlen(), so 'src' must be NUL-terminated
 * and its encoding must not contain embedded zero bytes (i.e. it has to be
 * ASCII compatible; UTF-16 input must go through wget_memiconv() instead).
 *
 * Returns the buffer produced by wget_memiconv(), or NULL if 'src' is NULL
 * or wget_memiconv() reports a failure (non-zero return value).
 */
char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
{
	char *converted;
	int rc;

	if (src == NULL)
		return NULL;

	// the output length is not needed here, hence the NULL 'outlen' argument
	rc = wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &converted, NULL);

	return rc == 0 ? converted : NULL;
}
|
||||
|
||||
int wget_str_needs_encoding(const char *s)
|
||||
@ -143,12 +173,12 @@ int wget_str_is_valid_utf8(const char *utf8)
|
||||
|
||||
/*
 * Convert the C string 'src' from 'encoding' to UTF-8.
 *
 * Thin wrapper around wget_striconv(); the return value is whatever
 * wget_striconv() returns (NULL when 'src' is NULL or the conversion fails).
 */
char *wget_str_to_utf8(const char *src, const char *encoding)
{
	return wget_striconv(src, encoding, "utf-8");
}
|
||||
|
||||
/*
 * Convert the UTF-8 C string 'src' into 'encoding'.
 *
 * Thin wrapper around wget_striconv(); the return value is whatever
 * wget_striconv() returns (NULL when 'src' is NULL or the conversion fails).
 */
char *wget_utf8_to_str(const char *src, const char *encoding)
{
	return wget_striconv(src, "utf-8", encoding);
}
|
||||
|
||||
#ifdef WITH_LIBIDN
|
||||
|
78
src/wget.c
78
src/wget.c
@ -127,7 +127,7 @@ static void
|
||||
rss_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
|
||||
rss_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base),
|
||||
metalink_parse_localfile(const char *fname),
|
||||
html_parse(JOB *job, int level, const char *data, const char *encoding, wget_iri_t *base),
|
||||
html_parse(JOB *job, int level, const char *data, size_t len, const char *encoding, wget_iri_t *base),
|
||||
html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base),
|
||||
css_parse(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
|
||||
css_parse_localfile(JOB *job, const char *fname, const char *encoding, wget_iri_t *base);
|
||||
@ -1383,9 +1383,9 @@ void *downloader_thread(void *p)
|
||||
if (config.recursive && (!config.level || job->level < config.level + config.page_requisites)) {
|
||||
if (resp->content_type) {
|
||||
if (!wget_strcasecmp_ascii(resp->content_type, "text/html")) {
|
||||
html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
|
||||
html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
|
||||
} else if (!wget_strcasecmp_ascii(resp->content_type, "application/xhtml+xml")) {
|
||||
html_parse(job, job->level, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
|
||||
html_parse(job, job->level, resp->body->data, resp->body->length, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
|
||||
// xml_parse(sockfd, resp, job->iri);
|
||||
} else if (!wget_strcasecmp_ascii(resp->content_type, "text/css")) {
|
||||
css_parse(job, resp->body->data, resp->content_type_encoding ? resp->content_type_encoding : config.remote_encoding, job->iri);
|
||||
@ -1500,16 +1500,14 @@ static unsigned int G_GNUC_WGET_PURE hash_url(const char *url)
|
||||
return hash;
|
||||
}
|
||||
|
||||
void html_parse(JOB *job, int level, const char *html, const char *encoding, wget_iri_t *base)
|
||||
void html_parse(JOB *job, int level, const char *html, size_t html_len, const char *encoding, wget_iri_t *base)
|
||||
{
|
||||
WGET_HTML_PARSED_RESULT *parsed = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
|
||||
wget_iri_t *allocated_base = NULL;
|
||||
const char *reason;
|
||||
char *utf8 = NULL;
|
||||
wget_buffer_t buf;
|
||||
char sbuf[1024];
|
||||
|
||||
if (config.robots && !parsed->follow)
|
||||
goto cleanup;
|
||||
int convert_links = config.convert_links && !config.delete_after;
|
||||
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
|
||||
if (encoding && encoding == config.remote_encoding) {
|
||||
@ -1519,33 +1517,62 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
|
||||
// Big-endian UTF-16
|
||||
encoding = "UTF-16BE";
|
||||
reason = _("set by BOM");
|
||||
|
||||
// adjust behind BOM, ignore trailing single byte
|
||||
html += 2;
|
||||
html_len -= 2;
|
||||
} else if ((unsigned char)html[0] == 0xFF && (unsigned char)html[1] == 0xFE) {
|
||||
// Little-endian UTF-16
|
||||
encoding = "UTF-16LE";
|
||||
reason = _("set by BOM");
|
||||
|
||||
// adjust behind BOM
|
||||
html += 2;
|
||||
html_len -= 2;
|
||||
} else if ((unsigned char)html[0] == 0xEF && (unsigned char)html[1] == 0xBB && (unsigned char)html[2] == 0xBF) {
|
||||
// UTF-8
|
||||
encoding = "UTF-8";
|
||||
reason = _("set by BOM");
|
||||
|
||||
// adjust behind BOM
|
||||
html += 3;
|
||||
html_len -= 3;
|
||||
} else {
|
||||
reason = _("set by server response");
|
||||
}
|
||||
}
|
||||
|
||||
if (!wget_strncasecmp(parsed->encoding, "UTF-16", 6) || !wget_strncasecmp(encoding, "UTF-16", 6)) {
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/, 12.2.2.2
|
||||
// we found an encoding in the HTML, so it can't be UTF-16*.
|
||||
encoding = "UTF-8";
|
||||
reason = _("wrong stated UTF-16* changed to UTF-8");
|
||||
}
|
||||
if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
|
||||
size_t n;
|
||||
|
||||
if (!encoding) {
|
||||
if (parsed->encoding) {
|
||||
encoding = parsed->encoding;
|
||||
reason = _("set by document");
|
||||
} else {
|
||||
encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
|
||||
reason = _("default, encoding not specified");
|
||||
html_len -= html_len & 1; // ignore single trailing byte, else charset conversion fails
|
||||
|
||||
if (wget_memiconv(encoding, html, html_len, "UTF-8", &utf8, &n) == 0) {
|
||||
info_printf(_("Convert non-ASCII encoding '%s' (%s) to UTF-8\n"), encoding, reason);
|
||||
html = utf8;
|
||||
if (convert_links) {
|
||||
convert_links = 0; // prevent link conversion
|
||||
info_printf(_("Link conversion disabled for '%s'\n"), job->local_filename);
|
||||
}
|
||||
|
||||
} else {
|
||||
info_printf(_("Failed to convert non-ASCII encoding '%s' (%s) to UTF-8, skip parsing\n"), encoding, reason);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
WGET_HTML_PARSED_RESULT *parsed = wget_html_get_urls_inline(html, config.follow_tags, config.ignore_tags);
|
||||
|
||||
if (config.robots && !parsed->follow)
|
||||
goto cleanup;
|
||||
|
||||
if (!encoding) {
|
||||
if (parsed->encoding) {
|
||||
encoding = parsed->encoding;
|
||||
reason = _("set by document");
|
||||
} else {
|
||||
encoding = "CP1252"; // default encoding for HTML5 (pre-HTML5 is iso-8859-1)
|
||||
reason = _("default, encoding not specified");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1609,7 +1636,7 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
|
||||
|
||||
wget_buffer_deinit(&buf);
|
||||
|
||||
if (config.convert_links && !config.delete_after) {
|
||||
if (convert_links && !config.delete_after) {
|
||||
for (int it = 0; it < wget_vector_size(parsed->uris); it++) {
|
||||
WGET_HTML_PARSED_URL *html_url = wget_vector_get(parsed->uris, it);
|
||||
html_url->url.p = (const char *) (html_url->url.p - html); // convert pointer to offset
|
||||
@ -1622,14 +1649,17 @@ void html_parse(JOB *job, int level, const char *html, const char *encoding, wge
|
||||
|
||||
cleanup:
|
||||
wget_html_free_urls_inline(&parsed);
|
||||
xfree(utf8);
|
||||
}
|
||||
|
||||
/*
 * Read the local file 'fname' and hand its contents to html_parse().
 *
 * The file is read exactly once; its byte length is passed along because
 * html_parse() takes an explicit length (needed for UTF-16 input, which
 * contains zero bytes and cannot be measured with strlen()).
 * 'job', 'level', 'encoding' and 'base' are forwarded unchanged.
 * The buffer returned by wget_read_file() is released before returning.
 */
void html_parse_localfile(JOB *job, int level, const char *fname, const char *encoding, wget_iri_t *base)
{
	char *data;
	size_t n;

	if ((data = wget_read_file(fname, &n)))
		html_parse(job, level, data, n, encoding, base);

	xfree(data);
}
|
||||
|
24
tests/test.c
24
tests/test.c
@ -1497,6 +1497,29 @@ static void test_stringmap(void)
|
||||
|
||||
}
|
||||
|
||||
static void test_striconv(void)
|
||||
{
|
||||
const char *utf8 = "abcßüäö";
|
||||
char *utf16be, *utf16le, *result;
|
||||
size_t n;
|
||||
|
||||
// convert utf-8 to utf-16be
|
||||
if (wget_memiconv("utf-8", utf8, strlen(utf8), "UTF-16BE", &utf16be, &n) ||
|
||||
wget_memiconv("UTF-16BE", utf16be, n, "UTF-16LE", &utf16le, &n) ||
|
||||
wget_memiconv("UTF-16LE", utf16le, n, "UTF-8", &result, &n) ||
|
||||
strcmp(utf8, result))
|
||||
{
|
||||
info_printf("Character conversion of '%s' failed (got '%s')\n", utf8, result);
|
||||
failed++;
|
||||
} else {
|
||||
ok++;
|
||||
}
|
||||
|
||||
xfree(result);
|
||||
xfree(utf16le);
|
||||
xfree(utf16be);
|
||||
}
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
// if VALGRIND testing is enabled, we have to call ourselves with valgrind checking
|
||||
@ -1530,6 +1553,7 @@ int main(int argc, const char **argv)
|
||||
test_hashing();
|
||||
test_vector();
|
||||
test_stringmap();
|
||||
test_striconv();
|
||||
|
||||
if (failed) {
|
||||
info_printf("ERROR: %d out of %d basic tests failed\n", failed, ok + failed);
|
||||
|
Reference in New Issue
Block a user