Commit e1cef456 authored by Elger Jonker's avatar Elger Jonker

Merge branch '30-discovery' into 'master'

Resolve "Verify that endpoints can die at header checks."

Closes #30

See merge request failmap/admin!29
parents c52cc8fa 02cfdbc7
......@@ -28,4 +28,5 @@ vendor/dnsrecon/
\ No newline at end of file
\ No newline at end of file
......@@ -8,7 +8,7 @@ from import (OrganizationRating, UrlRating, r
from failmap_admin.scanners.models import Endpoint
from failmap_admin.scanners.scanner_dns import brute_known_subdomains, certificate_transparency
from failmap_admin.scanners.scanner_http import scan_url_list_standard_ports
from failmap_admin.scanners.scanner_http import scan_urls_on_standard_ports
from failmap_admin.scanners.scanner_tls_qualys import ScannerTlsQualys
from .models import Coordinate, Organization, OrganizationType, Url
......@@ -153,7 +153,7 @@ class UrlAdmin(admin.ModelAdmin):
def discover_http_endpoints(self, request, queryset):
urls_to_scan = [url for url in queryset]
self.message_user(request, "URL(s) have been scanned for HTTP")
import logging
from import BaseCommand
from failmap_admin.organizations.models import Organization, Url
from failmap_admin.scanners.models import Endpoint
from failmap_admin.scanners.scanner_http import scan_url, scan_urls
from .support.arguments import add_discover_verify, add_organization_argument
logger = logging.getLogger(__package__)
# todo: add command line arguments: port and protocol.
class Command(BaseCommand):
help = 'Discover http(s) endpoints on well known ports.'
def add_arguments(self, parser):
def handle(self, *args, **options):
# some expansion magic to avoid using eval
func = "verify_existing_endpoints" if options['method'] == "verify" else "discover_endpoints"
functionlist = {"verify_existing_endpoints": verify_existing_endpoints,
"discover_endpoints": discover_endpoints}
if not options['organization']:
if options['organization'][0] == "_ALL_":
organization = Organization.objects.all().filter(name=options['organization'][0])
def verify_existing_endpoints(port=None, protocol=None, organization=None):
Checks all http(s) endpoints if they still exist. This is to monitor changes in the existing
dataset, without contacting an organization too often. It can be checked every few days,
as trying to find new endpoints is more involved and should not be run more than once every
two to four weeks.
The only result this scanner has is the same or less endpoints than we currently have.
:return: None
endpoints = Endpoint.objects.all().filter(is_dead=False,
if port:
endpoints = endpoints.filter(port=port)
if protocol:
endpoints = endpoints.filter(protocol=protocol)
endpoints = endpoints.filter(protocol__in=['http', 'https'])
if organization:
endpoints = endpoints.filter(url__organization=organization)
for endpoint in endpoints:
scan_url(endpoint.url, endpoint.port, endpoint.protocol)
def discover_endpoints(port=None, protocol=None, organization=None):
:return: None
urls = Url.objects.all().filter(is_dead=False, not_resolvable=False).filter()
if organization:
urls = urls.filter(organization=organization)
if protocol:
protocols = [protocol]
protocols = ['http', 'https']
if port:
ports = [port]
# Yes, HTTP sites on port 443 exist, we've seen many of them. Not just warnings(!).
# Don't underestimate the flexibility of the internet.
ports = [80, 81, 82, 88, 443, 8008, 8080, 8088, 8443, 8888, 9443]
logger.debug("Going to scan %s urls." % urls.count())
scan_urls(urls, ports, protocols)
import logging
from django.core.exceptions import ObjectDoesNotExist
from import BaseCommand
from failmap_admin.organizations.models import Organization, Url
from failmap_admin.scanners.scanner_http import ScannerHttp
logger = logging.getLogger(__package__)
# todo: when tls scanner ends, it hangs.
# todo: add command line arguments: port and protocol.
class Command(BaseCommand):
help = 'Discover http sites'
def add_arguments(self, parser):
def handle(self, *args, **options):
# urls without endpoints
if not options['organization']:
urls_without_endpoints = \
not_resolvable=False, is_dead=False).exclude(
endpoint__protocol__in=['http', 'https'])
logger.debug("Found %s urls that don't have http(s) endpoints yet. "
% urls_without_endpoints.count())
if options['organization'] and options['organization'] == "_ALL_":
s = ScannerHttp()
s.scan_multithreaded(port=8443, protocol="https")
logging.debug("Looking for organization: %s" % options['organization'][0])
o = Organization.objects.get(name=options['organization'][0])
urls = Url.objects.all().filter(organization=o)
except ObjectDoesNotExist:
logging.debug("Organization was not found.")
import argparse
from django.core.exceptions import ObjectDoesNotExist
from failmap_admin.organizations.models import Organization
def add_organization_argument(parser):
return parser.add_argument(
'--organization', '-o',
help="Name of an organization, for example Arnhem. Prefix spaces with a backslash (\\)",
def add_discover_verify(parser):
return parser.add_argument(
'--method', '-m',
help="verify|discover. Verify checks all existing ones, discover tries to find new ones.",
def valid_organization(name):
if "_ALL_" in name:
return "_ALL_"
o = Organization.objects.get(name=name)
except ObjectDoesNotExist:
raise argparse.ArgumentTypeError("%s is not a valid organization or _ALL_" % name)
def valid_discover_verify(option):
    """Argparse type validator for the --method option.

    :param option: raw command line value, expected 'discover' or 'verify'
    :return: the validated option string, unchanged
    :raises argparse.ArgumentTypeError: for any other value
    """
    if option in ("verify", "discover"):
        return option
    # Bugfix: the original message lacked a %s placeholder, so applying "%"
    # raised TypeError instead of the intended ArgumentTypeError.
    raise argparse.ArgumentTypeError(
        "Method can be either 'discover' or 'verify'. Given: %s" % option)
......@@ -33,98 +33,106 @@ import requests
from requests import ConnectTimeout, HTTPError, ReadTimeout, Timeout
from requests.exceptions import ConnectionError
from failmap_admin.organizations.models import Url
from failmap_admin.celery import app
from .models import Endpoint
logger = logging.getLogger(__package__)
# todo: separating finding IP adresses and endpoints.
def scan(self):
# clean url: add http and portnumber 80. Try other ports later.
urls = Url.objects.all()
for url in urls:
scan_url(url, 80, "http")
def validate_port(port):
    """Reject port numbers outside the valid TCP range (0-65535).

    :param port: port number to check
    :raises ValueError: when the port is out of range
    """
    if not 0 <= port <= 65535:
        message = "Invalid port number, must be between 0 and 65535. %s" % port
        logger.error(message)
        raise ValueError(message)
def validate_protocol(protocol):
    """Reject anything that is not plain http or https.

    :param protocol: protocol name to check
    :raises ValueError: when the protocol is not supported
    """
    if protocol in ("http", "https"):
        return
    message = "Invalid protocol %s, options are: http, https" % protocol
    logger.error(message)
    raise ValueError(message)
def scan_url_list_standard_ports(urls):
    """Scan the given urls on the most common web port/protocol combinations.

    :param urls: iterable of Url objects to probe
    """
    common_services = (
        (443, 'https'),
        (80, 'http'),
        (8080, 'http'),
        (8443, 'https'),
    )
    for port, protocol in common_services:
        scan_url_list(urls, port, protocol)
    # Less common combinations, currently disabled:
    # ScannerHttp.scan_url_list(urls, 8088, 'http')
    # ScannerHttp.scan_url_list(urls, 8888, 'http')
    # ScannerHttp.scan_url_list(urls, 8008, 'http')
    # ScannerHttp.scan_url_list(urls, 9443, 'https')
def scan_urls_on_standard_ports(urls):
    """Try to discover http(s) endpoints on the well known web ports.

    Bugfix: the original called scan_url, which takes a single url, port and
    protocol, with a list of ports and protocols; scan_urls is the sibling
    that accepts lists and iterates the combinations.

    :param urls: iterable of Url objects to probe
    """
    scan_urls(urls, [80, 81, 82, 88, 443, 8008, 8080, 8088, 8443, 8888, 9443], ['http', 'https'])
def scan_url_list(urls, port=80, protocol="http"):
from multiprocessing import Pool
pool = Pool(processes=8)
def scan_urls(urls, ports, protocols):
if not has_internet_connection():
logger.error("No internet connection! Try again later!")
if protocol not in ["http", "https"]:
logger.error("Invalid protocol %s, options are: http, https" % protocol)
for port in ports:
if port > 65535 or port < 0:
logger.error("Invalid port number, must be between 0 and 65535. %s" % port)
for protocol in protocols:
for url in urls:
pool.apply_async(scan_url, [url, port, protocol],
logger.debug("Closing pool")
logger.debug("Joining pool")
# put some distance between the times an url is contacted, so it is less pressuring
# therefore, we do this per port and protocol instead of per url.
for port in ports:
for protocol in protocols:
for url in urls:
scan_url(url, port, protocol)
def scan_multithreaded(port=80, protocol="http", only_new=False):
def scan_url(url, port=80, protocol="https"):
task = scan_url_task.s(url, port, protocol)
if not only_new:
urls = Url.objects.all() # scans ALL urls.
# todo: only new urls, those that don't have an endpoint on the protocol+port.
# not without _any_ endpoint, given that there will soon be endpoints for it.
# this also re-verifies all domains that explicitly don't have an endpoint on this
# port+protocol, which can be a bit slow. (we're not saving it reversely).
# todo: this is not correct yet.
urls = Url.objects.all().exclude(endpoint__port=port, endpoint__protocol=protocol)
urls = Url.objects.all()
scan_url_list(urls, port, protocol)
def database_debug():
# had the wrong env.
from django.db import connection
from failmap_admin import settings
def success_callback(x):"Success!")
sql = "SELECT name FROM sqlite_master WHERE type='table';"
cursor = connection.cursor()
rows = cursor.fetchall()
for row in rows:
def error_callback(x):
logger.error("Error callback!")
# Simple: if there is a http response (status code), there is a http server.
# There might be other protocols on standard ports.
# Even if the IP constantly changes, we know that a scanner will find something by url
# todo: check if we can scan https, due to our https lib not supporting "rest of world"
# todo: check headers using another scanner, don't use this one, even though it contacts
# the server (?)
# todo: further look into dig, which at the moment doesn't return more than what we have...
# We don't make endpoints for servers that don't exist: as opposed to qualys, since that
# scanner is slow. (perhaps we should in that case?)
# todo: option to not find IP's, only use existing ip's of endpoints / urls.
def scan_url_task(url, port=80, protocol="https"):
Searches for both IPv4 and IPv6 IP addresses / types.
The algorithm is very simple: if there is a http status code, or "a response" there is an
http(s) server. Some servers don't return a status code, others have problems with tls.
So you need either build something extremely robust, or make an easy assumption that there
could be a website there. Given the ports we scan, the probability of a website is extremely
def scan_url(url, port=80, protocol="https"):
We don't scan for the obsoleted S-HTTP protocol, only for http and https.
It's possible to have a TLS site on port 80 and a non-TLS site on port 443. We've seen those.
This function does not store all ports it couldn't contact. Would we do that, the amount
of endpoints that are not resolvable explodes. There is not really value in storing the
non-resolvable urls, as you need to re-scan everything once in a while anyway.
If we would store this, it would be url * ports endpoints. Now it's roughly urls * 1.8.
TLS does not have to be successful. We also store https sites where HTTPS completely or
partially fails. As long as there is a "sort of" response we just assume there is a
website there. Other TLS scanners can check what's wrong with the connection. Perhaps
this leads to some false positives or to some other wrong data.
The big question is: would some https sites only respond IF the right protocol (SSL 1) or
something like that is spoken to them? Do we need a "special" TLS implementation on our server?
Todo: further look if DIG can be of value to us. Until now it seems not so.
Todo: remove IP from endpoints. (change for version 1.1)
domain = "%s://%s:%s" % (protocol, url.url, port)
logger.debug("Scanning http(s) server on: %s" % domain)
......@@ -168,14 +176,22 @@ def scan_url(url, port=80, protocol="https"):
except (ConnectTimeout, Timeout, ReadTimeout) as Ex:
logger.debug("%s: Timeout! - %s" % (url, Ex))
except (ConnectionRefusedError, ConnectionError, HTTPError) as Ex:
# ConnectionRefusedError: [Errno 61] Connection refused
# Some errors have our interests:
# BadStatusLine is an error, which signifies that the server gives an answer.
# Example: returning HTML, incompatible TLS (binary)
# CertificateError
# Example: wrong domain name for certificate
# certificate verify failed
# We don't care about certificate verification errors: it is a valid response.
Some errors really mean there is no site. Example is the ConnectionRefusedError: [Errno 61]
which means the endpoint can be killed.
There can be many, many, many errors that still can be translated into an existing site.
Until now we've found in responses:
- BadStatusLine
- CertificateError
- certificate verify failed
This all indicates that there is a service there. So this is stored.
# Nope: EOF occurred in violation of protocol
# Nope: also: fine, a response! :) - youll get an unexpected closed connection.
logger.debug("%s: NOPE! - %s" % (url, Ex))
......@@ -272,7 +288,6 @@ def endpoint_exists(url, port, protocol, ip):
def kill_endpoint(url, port, protocol, ip):
eps = Endpoint.objects.all().filter(url=url,
......@@ -86,7 +86,7 @@ try:
import django_extensions
INSTALLED_APPS += ['django_extensions']
except ImportError:
print("Django Extensions is not installed (not a dev setup?) Install if needed.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment