Robots.txt processor triggers a heap-use-after-free error
To reproduce this issue, use the following code to create a web server with MHD (libmicrohttpd); it simulates an HTTP persistent connection:
#include <microhttpd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#define PAGE "<html><head><title>libmicrohttpd demo</title></head><body>libmicrohttpd demo</body></html>"
#define ROBOTS "User-agent: Badboy\nDisallow: /\n\n# a simple comment\nUser-agent: *\nDisallow: /directory/\n"
/*
 * Access handler for the demo server.
 *
 * Replies with HTTP 200 and a static body: ROBOTS when the requested URL is
 * exactly "/robots.txt", PAGE for every other URL.  The cls, method, version,
 * upload_data, upload_data_size and ptr arguments are not inspected.
 *
 * Returns whatever MHD_queue_response() returns.
 */
static int
ahc_echo (void *cls,
          struct MHD_Connection *connection,
          const char *url,
          const char *method,
          const char *version,
          const char *upload_data, size_t *upload_data_size, void **ptr)
{
  /* Choose the body up front so the response is built in a single place. */
  const char *body = (strcmp (url, "/robots.txt") == 0) ? ROBOTS : PAGE;
  struct MHD_Response *reply;
  int queued;

  /* Both bodies are string literals, handed over as MHD_RESPMEM_PERSISTENT. */
  reply = MHD_create_response_from_buffer (strlen (body),
                                           (void *) body,
                                           MHD_RESPMEM_PERSISTENT);
  queued = MHD_queue_response (connection, MHD_HTTP_OK, reply);

  /* Release the response handle once it has been queued. */
  MHD_destroy_response (reply);
  return queued;
}
/*
 * Start the demo HTTP server on the port given as the only command-line
 * argument and keep it running until a character is read from stdin.
 *
 * Returns 0 after the daemon has been stopped, and 1 on a usage error, an
 * invalid port argument, or when the daemon could not be started.
 */
int
main (int argc, char *const *argv)
{
  struct MHD_Daemon *d;
  char *end;
  long port;

  if (argc != 2)
    {
      printf ("%s PORT\n", argv[0]);
      return 1;
    }

  /* strtol rather than atoi: atoi cannot report errors, so a non-numeric
     argument would silently turn into port 0.  An overflowing value comes
     back as LONG_MAX/LONG_MIN and is rejected by the range check. */
  port = strtol (argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || port < 0 || port > 65535)
    {
      fprintf (stderr, "%s: invalid port '%s'\n", argv[0], argv[1]);
      return 1;
    }

  d = MHD_start_daemon (MHD_USE_AUTO | MHD_USE_INTERNAL_POLLING_THREAD | MHD_USE_ERROR_LOG,
                        (unsigned short) port,
                        NULL, NULL, &ahc_echo, NULL,
                        MHD_OPTION_CONNECTION_TIMEOUT, (unsigned int) 120,
                        MHD_OPTION_STRICT_FOR_CLIENT, (int) 1,
                        MHD_OPTION_END);
  if (d == NULL)
    return 1;

  /* Block until any input arrives on stdin, then shut the server down. */
  (void) getc (stdin);
  MHD_stop_daemon (d);
  return 0;
}
gcc -o example example.c -lmicrohttpd
./example 8000
Build Wget2 with the following configure options (I used commit 8b16045f from master):
./configure --enable-fsanitize-asan --enable-fsanitize-ubsan -C && make && sudo make install
And do:
wget2 -d -r -nH http://localhost:8000
22.230416.236 name=-nH value=0
22.230416.237 name=-nH value=0
22.230416.237 Local URI encoding = 'UTF-8'
22.230416.237 Input URI encoding = 'UTF-8'
22.230416.237 add HSTS codeload.github.com:443 (maxage=31536000, includeSubDomains=0)
22.230416.237 add HSTS linux.com:443 (maxage=16070400, includeSubDomains=0)
22.230416.237 add HSTS duckduckgo.com:443 (maxage=31536000, includeSubDomains=0)
22.230416.237 add HSTS raw.githubusercontent.com:443 (maxage=31536000, includeSubDomains=0)
22.230416.237 add HSTS www.gnu.org:443 (maxage=63072000, includeSubDomains=0)
22.230416.237 add HSTS gitlab.com:443 (maxage=31536000, includeSubDomains=0)
22.230416.237 add HSTS github.com:443 (maxage=31536000, includeSubDomains=1)
22.230416.238 Fetched HSTS data from '/home/didik/.wget-hsts'
22.230416.238 HPKP: entry 'github.com' is expired
22.230416.238 HPKP: skipping PIN entry: '*sha256 WoiWRyIOVNa9ihaBciRSC7XHjliYS9VwUGOIud4PB18='
22.230416.238 HPKP: skipping PIN entry: '*sha256 RRM1dGqnDFsCJXBTHky16vi1obOlCgFFn/yOhI/y+ho='
22.230416.238 HPKP: skipping PIN entry: '*sha256 k2v657xBsOVe1PQRwOsHsw3bsGT2VzIqz5K+59sNQws='
22.230416.238 HPKP: skipping PIN entry: '*sha256 K87oWBWM9UZfyddvDfoxL+8lpNyoUB2ptGtn0fv6G2Q='
22.230416.238 HPKP: skipping PIN entry: '*sha256 IQBnNBEiFuhj+8x6X8XLgh01V9Ic5/V3IRQLNFFc7v4='
22.230416.238 HPKP: skipping PIN entry: '*sha256 iie1VXtL7HzAMF+/PVPR9xzT80kQxdZeJ+zduCB3uj0='
22.230416.238 HPKP: skipping PIN entry: '*sha256 LvRiGEjRqfzurezaWuj8Wie2gyHMrW5Q06LspMnox7A='
22.230416.238 Fetched HPKP data from '/home/didik/.wget-hpkp'
22.230416.238 add TLS session data for localhost (maxage=64800, size=1315)
22.230416.238 Fetched TLS session data from '/home/didik/.wget-session'
22.230416.238 Fetched OCSP hosts from '/home/didik/.wget-ocsp_hosts'
22.230416.238 Fetched OCSP fingerprints from '/home/didik/.wget-ocsp'
22.230416.238 *url = http://localhost:8000
22.230416.238 *3 http://localhost:8000
22.230416.238 *url = /robots.txt
22.230416.238 path /robots.txt ->
22.230416.238 robots.txt
22.230416.239 *2 http://localhost:8000/robots.txt
22.230416.239 local filename = 'robots.txt'
22.230416.239 host_add_robotstxt_job: 0x60c00000b5c0 http://localhost:8000/robots.txt
22.230416.239 host_add_robotstxt_job: qsize 1 host-qsize=1
22.230416.239 local filename = 'index.html'
22.230416.239 host_add_job: job fname index.html
22.230416.239 host_add_job: 0x60d00000c9d0 http://localhost:8000
22.230416.239 host_add_job: qsize 2 host-qsize=2
22.230416.239 queue_size: qsize=2
22.230416.239 queue_size: qsize=2
22.230416.239 queue_size: qsize=2
22.230416.239 queue_size: qsize=2
22.230416.240 [0] action=1 pending=0 host=0x0
22.230416.240 qsize=2 blocked=0
22.230416.240 pause=-1500739456240
22.230416.240 dequeue robot job http://localhost:8000/robots.txt
22.230416.240 [1] action=1 pending=0 host=0x0
22.230416.240 resolving localhost:8000...
22.230416.240 qsize=2 blocked=0
22.230416.240 pause=-1500739456240
22.230416.240 robot job inuse
22.230416.241 has 127.0.0.1:8000
22.230416.241 Add dns cache entry localhost:8000
22.230416.241 trying 127.0.0.1:8000...
22.230416.241 established connection localhost
22.230416.242 cookie_create_request_header for host=localhost path=robots.txt
22.230416.242 # sent 220 bytes:
GET /robots.txt HTTP/1.1
Host: localhost
Accept-Encoding: gzip, deflate, bzip2, xz, lzma, br
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
User-Agent: wget2/1.0.0
Connection: keep-alive
22.230416.242 [0] action=2 pending=1 host=0x60700000de60
22.230416.242 ### req 0x61500002fb00 pending requests = 1
22.230416.242 nbytes 187 nread 0 102400
22.230416.242 # got header 96 bytes:
HTTP/1.1 200 OK
Connection: Keep-Alive
Content-Length: 87
Date: Sat, 22 Jul 2017 16:04:16 GMT
22.230416.243 method 2
22.230416.243 keep_alive=1
22.230416.243 host_remove_job: 0x60c00000b5c0
22.230416.243 host_remove_job: qsize=1 host->qsize=1
22.230416.243 [0] action=1 pending=0 host=0x60700000de60
22.230416.243 qsize=1 blocked=0
22.230416.243 pause=-1500739456243
22.230416.243 dequeue job http://localhost:8000
=================================================================
==23021==ERROR: AddressSanitizer: heap-use-after-free on address 0x610000007de2 at pc 0x7f01e888f2d5 bp 0x7f01df725c60 sp 0x7f01df725408
READ of size 5 at 0x610000007de2 thread T1
22.230416.243 main: wake up
22.230416.243 queue_size: qsize=1
#0 0x7f01e888f2d4 (/usr/lib/x86_64-linux-gnu/libasan.so.2+0x472d4)
#1 0x7f01e84ea1c9 in wget_strcmp /home/didik/wget2/libwget/utils.c:82
#2 0x41bd1a in try_connection /home/didik/wget2/src/wget.c:1150
#3 0x41c6c7 in establish_connection /home/didik/wget2/src/wget.c:1213
#4 0x42354c in downloader_thread /home/didik/wget2/src/wget.c:1705
#5 0x7f01e72906b9 in start_thread (/lib/x86_64-linux-gnu/libpthread.so.0+0x76b9)
#6 0x7f01e6fc63dc in clone (/lib/x86_64-linux-gnu/libc.so.6+0x1073dc)
0x610000007de2 is located 162 bytes inside of 178-byte region [0x610000007d40,0x610000007df2)
freed by thread T1 here:
#0 0x7f01e88e02ca in __interceptor_free (/usr/lib/x86_64-linux-gnu/libasan.so.2+0x982ca)
#1 0x7f01e84c1505 in wget_iri_free /home/didik/wget2/libwget/iri.c:287
#2 0x40cb7f in host_remove_job /home/didik/wget2/src/host.c:347
#3 0x423e46 in downloader_thread /home/didik/wget2/src/wget.c:1777
#4 0x7f01e72906b9 in start_thread (/lib/x86_64-linux-gnu/libpthread.so.0+0x76b9)
previously allocated by thread T0 here:
#0 0x7f01e88e0602 in malloc (/usr/lib/x86_64-linux-gnu/libasan.so.2+0x98602)
#1 0x7f01e84f1a12 in wget_malloc /home/didik/wget2/libwget/xalloc.c:85
#2 0x7f01e84c1742 in wget_iri_parse /home/didik/wget2/libwget/iri.c:350
#3 0x7f01e84c6b02 in wget_iri_parse_base /home/didik/wget2/libwget/iri.c:818
#4 0x40bf9e in host_add_robotstxt_job /home/didik/wget2/src/host.c:288
#5 0x414d52 in add_url_to_queue /home/didik/wget2/src/wget.c:456
#6 0x4197e3 in main /home/didik/wget2/src/wget.c:909
#7 0x7f01e6edf82f in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2082f)
Thread T1 created by T0 here:
#0 0x7f01e887e253 in pthread_create (/usr/lib/x86_64-linux-gnu/libasan.so.2+0x36253)
#1 0x7f01e84e6e6b in wget_thread_start /home/didik/wget2/libwget/thread.c:48
#2 0x41aa05 in main /home/didik/wget2/src/wget.c:1025
#3 0x7f01e6edf82f in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2082f)
SUMMARY: AddressSanitizer: heap-use-after-free ??:0 ??
Shadow bytes around the buggy address:
0x0c207fff8f60: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c207fff8f70: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c207fff8f80: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c207fff8f90: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c207fff8fa0: fa fa fa fa fa fa fa fa fd fd fd fd fd fd fd fd
=>0x0c207fff8fb0: fd fd fd fd fd fd fd fd fd fd fd fd[fd]fd fd fa
0x0c207fff8fc0: fa fa fa fa fa fa fa fa 00 00 00 00 00 00 00 00
0x0c207fff8fd0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0c207fff8fe0: fa fa fa fa fa fa fa fa fd fd fd fd fd fd fd fd
0x0c207fff8ff0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c207fff9000: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Heap right redzone: fb
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack partial redzone: f4
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
==23021==ABORTING
The current test suite won't reproduce this because its server does not use persistent connections.
For comparison, the same run using Wget1:
wget -d -r -nH http://localhost:8000
Setting --recursive (recursive) to 1
Setting --no (addhostdir) to 0
DEBUG output created by Wget 1.19.1 on linux-gnu.
Reading HSTS entries from /home/didik/.wget-hsts
URI encoding = ‘UTF-8’
URI encoding = ‘UTF-8’
Enqueuing http://localhost:8000/ at depth 0
Queue count 1, maxcount 1.
[IRI Enqueuing ‘http://localhost:8000/’ with ‘UTF-8’
Dequeuing http://localhost:8000/ at depth 0
Queue count 0, maxcount 1.
Converted file name 'index.html' (UTF-8) -> 'index.html' (UTF-8)
--2017-07-22 23:04:31-- http://localhost:8000/
Resolving localhost (localhost)... 127.0.0.1
Caching localhost => 127.0.0.1
Connecting to localhost (localhost)|127.0.0.1|:8000... connected.
Created socket 4.
Releasing 0x0000000001d64fe0 (new refcount 1).
---request begin---
GET / HTTP/1.1
User-Agent: Wget/1.19.1 (linux-gnu)
Accept: */*
Accept-Encoding: identity
Host: localhost:8000
Connection: Keep-Alive
---request end---
HTTP request sent, awaiting response...
---response begin---
HTTP/1.1 200 OK
Connection: Keep-Alive
Content-Length: 90
Date: Sat, 22 Jul 2017 16:04:31 GMT
---response end---
200 OK
Registered socket 4 for persistent reuse.
Length: 90
Saving to: ‘index.html’
0K 100% 18.1M=0s
2017-07-22 23:04:31 (18.1 MB/s) - ‘index.html’ saved [90/90]
Loaded index.html (size 90).
no-follow in index.html: 0
FINISHED --2017-07-22 23:04:31--
Total wall clock time: 0.002s
Downloaded: 1 files, 90 in 0s (18.1 MB/s)
Based on the error log above, the failing read points to this code in src/wget.c:
if (!wget_strcmp(wget_http_get_host(conn), iri->host) &&
wget_http_get_scheme(conn) == iri->scheme &&
!wget_strcmp(wget_http_get_port(conn), iri->resolv_port))
I am still trying to find the root cause of this problem.