wget2/tests/test-robots.c
Tim Rühsen 36b095fd64 Fix Robots Exclusion Standard
* include/libwget.h.in: Add function wget_list_getnext().
* libwget/list.c: Add function wget_list_getnext() (see the sketch after this list).
* libwget/robots.c: Fix memory leak.
* src/host.c (host_remove_job): Cleanup queue after downloading and
  scanning robots.txt.
* src/job.h (struct JOB): Add flag 'requested_by_user'.
* src/wget.c (add_url_to_queue): Set 'requested_by_user',
  (add_url): Fix checking for disallowed paths.
* tests/Makefile.am: Add test 'test-robots'.
* tests/test-robots.c: New test to prove robots functionality.
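
The new wget_list_getnext() is what lets a caller walk a list element by
element instead of only through a browse callback, so each queued entry can
be inspected and kept or dropped individually. The self-contained sketch
below only illustrates that access pattern with hypothetical names (node_t,
list_add, list_getfirst, list_getnext); it is not the libwget implementation,
and the real libwget types and signatures may differ.

/* Sketch only -- not libwget code. A tiny list with getfirst()/getnext()
 * accessors so the caller can iterate and filter elements itself.
 * All names (node_t, list_add, list_getfirst, list_getnext) are hypothetical. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct node {
	struct node *next;
	char data[32];
} node_t;

/* prepend an element; returns the new head */
static node_t *list_add(node_t *head, const char *data)
{
	node_t *n = calloc(1, sizeof(*n));

	if (!n)
		exit(EXIT_FAILURE);
	strncpy(n->data, data, sizeof(n->data) - 1);
	n->next = head;
	return n;
}

static node_t *list_getfirst(node_t *head) { return head; }
static node_t *list_getnext(node_t *elem) { return elem ? elem->next : NULL; }

int main(void)
{
	node_t *queue = NULL;

	queue = list_add(queue, "/subdir1/subpage1.html");
	queue = list_add(queue, "/subdir2/subpage2.html");

	/* iterate without a callback: the caller sees each element and can
	 * apply its own keep/drop policy (e.g. a robots.txt check) */
	for (node_t *e = list_getfirst(queue); e; e = list_getnext(e))
		printf("queued: %s\n", e->data);

	while (queue) {
		node_t *next = queue->next;
		free(queue);
		queue = next;
	}
	return 0;
}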

Special handling for automatic robots.txt jobs
==============================================
What can happen with --recursive and --span-hosts is that a document from hostA
has links to hostB. All these links might go into the hostB queue before robots.txt
is downloaded and parsed. To avoid downloading 'disallowed' documents, the queue
for hostB has to be cleaned up right after robots.txt has been downloaded and parsed
(sketched below). Any links that have been explicitly requested by the user are
still downloaded.
2016-09-19 15:23:48 +02:00
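
A rough, illustrative sketch of the cleanup step described above: once
robots.txt for a host has been downloaded and parsed, walk that host's queue
and drop every job whose path matches a Disallow rule, unless the job carries
the 'requested_by_user' flag. All names below (job_t, path_is_disallowed,
cleanup_queue) are made up for illustration; this is not the actual src/host.c
code.

/* Illustration only -- not the wget2 implementation. Shows the decision
 * rule: after robots.txt is parsed, queued jobs on disallowed paths are
 * dropped unless the user asked for them explicitly. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct {
	const char *path;
	bool requested_by_user;	/* set when the URL came from the command line */
	bool dropped;
} job_t;

/* hypothetical robots check: does any Disallow prefix match the path? */
static bool path_is_disallowed(const char *path, const char *const *disallow, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (strncmp(path, disallow[i], strlen(disallow[i])) == 0)
			return true;
	return false;
}

static void cleanup_queue(job_t *queue, size_t njobs, const char *const *disallow, size_t ndisallow)
{
	for (size_t i = 0; i < njobs; i++) {
		if (queue[i].requested_by_user)
			continue;	/* explicit user requests are always kept */
		if (path_is_disallowed(queue[i].path, disallow, ndisallow))
			queue[i].dropped = true;
	}
}

int main(void)
{
	const char *const disallow[] = { "/subdir2/" };
	job_t queue[] = {
		{ "/subdir1/subpage1.html", false, false },
		{ "/subdir2/subpage1.html", false, false },	/* found by recursion: dropped */
		{ "/subdir2/subpage2.html", true, false },	/* requested by the user: kept */
	};

	cleanup_queue(queue, 3, disallow, 1);

	for (size_t i = 0; i < 3; i++)
		printf("%-24s %s\n", queue[i].path, queue[i].dropped ? "dropped" : "kept");
	return 0;
}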


/*
 * Copyright(c) 2015-2016 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget. If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * Testing basic robots.txt functionality
 *
 */
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdlib.h> // exit()
#include "libtest.h"
int main(void)
{
	wget_test_url_t urls[]={
		{	.name = "/robots.txt",
			.code = "200 Dontcare",
			.body =
				"User-agent: Badboy\n"\
				"Disallow: /\n"\
				"\n"
				"# a simple comment\n"\
				"User-agent: *\n"\
				"Disallow: /subdir2/\n"\
			,
			.headers = {
				"Content-Type: text/plain",
			}
		},
		{	.name = "/index.html",
			.code = "200 Dontcare",
			.body =
				"<html><head><title>Main Page</title><body><p>A link to a" \
				" <A href=\"http://localhost:{{port}}/secondpage.html\">second page</a>." \
				" <a href=\"/subdir1/subpage1.html?query&param#frag\">page in subdir1</a>." \
				" <a href=\"./subdir1/subpage2.html\">page in subdir1</a>." \
				"</p></body></html>",
			.headers = {
				"Content-Type: text/html",
			}
		},
		{	.name = "/secondpage.html",
			.code = "200 Dontcare",
			.body =
				"<html><head><title>Main Page</title><base href=\"/subdir2/\"></head><body><p>A link to a" \
				" <A href=\"../secondpage.html\">second page</a>." \
				" <a href=\"subpage1.html?query&param#frag\">page in subdir2</a>." \
				" <a href=\"./subpage2.html\">page in subdir2</a>." \
				"</p></body></html>",
			.headers = {
				"Content-Type: text/html",
			}
		},
		{	.name = "/subdir1/subpage1.html?query&param",
			.code = "200 Dontcare",
			.body = "sub1_1"
		},
		{	.name = "/subdir1/subpage2.html",
			.code = "200 Dontcare",
			.body = "sub1_2"
		},
		{	.name = "/subdir2/subpage1.html?query&param",
			.code = "200 Dontcare",
			.body = "sub2_1"
		},
		{	.name = "/subdir2/subpage2.html",
			.code = "200 Dontcare",
			.body = "sub2_2"
		},
	};
	// functions won't come back if an error occurs
	wget_test_start_server(
		WGET_TEST_RESPONSE_URLS, &urls, countof(urls),
		0);

	// robots.txt forbids /subdir2/ for '*'
	wget_test(
		WGET_TEST_OPTIONS, "-r -nH",
		WGET_TEST_REQUEST_URL, "index.html",
		WGET_TEST_EXPECTED_ERROR_CODE, 0,
		WGET_TEST_EXPECTED_FILES, &(wget_test_file_t []) {
			{ urls[0].name + 1, urls[0].body },	// '+ 1' strips the leading '/' to form the local file name
			{ urls[1].name + 1, urls[1].body },
			{ urls[2].name + 1, urls[2].body },
			{ urls[3].name + 1, urls[3].body },
			{ urls[4].name + 1, urls[4].body },
			{ NULL } },
		0);

	// robots.txt forbids /subdir2/ for '*', but the explicitly requested page is still downloaded
	wget_test(
		WGET_TEST_OPTIONS, "-r -nH",
		WGET_TEST_REQUEST_URLS, "index.html", "subdir2/subpage2.html", NULL,
		WGET_TEST_EXPECTED_ERROR_CODE, 0,
		WGET_TEST_EXPECTED_FILES, &(wget_test_file_t []) {
			{ urls[0].name + 1, urls[0].body },
			{ urls[1].name + 1, urls[1].body },
			{ urls[2].name + 1, urls[2].body },
			{ urls[3].name + 1, urls[3].body },
			{ urls[4].name + 1, urls[4].body },
			{ urls[6].name + 1, urls[6].body },	// /subdir2/subpage2.html was requested by the user
			{ NULL } },
		0);

	exit(0);
}