Commit ed2f5305 authored by Sophie Brun's avatar Sophie Brun

Merge tag 'upstream/1.5.3'

Upstream version 1.5.3
parents ca8f9ec6 ffbebaf8
2014-08-20 Basic <simsong@r4.ncr.nps.edu>
* src/scan_pipe.cpp (scan_pipe): removed scan_pipe (since you don't want to fork with threads)
2014-08-12 Man Page <simsong@mncrnpsedu.local>
* doc/announce_1.5.2.md: announced release 1.5.2
2014-08-03 Basic <simsong@r4.ncr.nps.edu>
* configure.ac: incremented version number
......
......@@ -7,7 +7,7 @@
# and http://www.openismus.com/documents/linux/automake/automake.shtml
AC_PREREQ(2.57)
AC_INIT(BULK_EXTRACTOR, 1.5.1, bugs@digitalcorpora.org)
AC_INIT(BULK_EXTRACTOR, 1.5.3, bugs@digitalcorpora.org)
AC_CONFIG_MACRO_DIR(m4)
......
This diff is collapsed.
......@@ -46,7 +46,6 @@ def process(out,dname1,dname2):
print(" %s (%d lines)" % (f,a.count_lines(f)))
else:
print(" %s" % (f))
print("")
# Report interesting differences based on the histograms.
# Output Example:
......@@ -56,8 +55,9 @@ def process(out,dname1,dname2):
8 17 9 steve@mac.com
11 16 5 bobsmith@hotmail.com
"""
common_files = b1.files.intersection(b2.files)
histogram_files = filter(lambda a:"histogram" in a,common_files)
b1_histograms = set(b1.histogram_files())
b2_histograms = set(b2.histogram_files())
common_histograms = b1_histograms.intersection(b2_histograms)
if options.html:
out.write("<ul>\n")
......@@ -65,8 +65,8 @@ def process(out,dname1,dname2):
out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
out.write("</ul>\n<hr/>\n")
diffcount = 0
for histogram_file in sorted(histogram_files):
for histogram_file in sorted(common_histograms):
diffcount = 0
if options.html:
out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
else:
......@@ -106,7 +106,31 @@ def process(out,dname1,dname2):
out.write("{}: No differences\n".format(histogram_file))
else:
out.write("{}: No differences\n".format(histogram_file))
if options.features:
    # Compare the feature files that both reports contain, once in each
    # direction, and report every feature whose position key is missing
    # from the other report.
    b2_feature_files = set(b2.feature_files())   # evaluated once, not per file
    for feature_file in b1.feature_files():
        if feature_file not in b2_feature_files:
            continue
        print("Compare features",feature_file)
        # The earlier "a = b2; b = a" left both names bound to b2, so the
        # second direction compared b2 with itself and never reported anything.
        for (a,b) in [(b1,b2),(b2,b1)]:
            # Position keys (first field of each parsed line) present in a.
            a_positions = set()
            for line in a.open(feature_file):
                r = bulk_extractor_reader.parse_feature_line(line)
                if not r: continue
                a_positions.add(r[0])
            for line in b.open(feature_file):
                r = bulk_extractor_reader.parse_feature_line(line)
                if not r: continue
                if r[0] not in a_positions:
                    # The line was read from b and is absent from a,
                    # so it is b that uniquely holds it.
                    print("{} {} is only in {}".format(r[0],r[1],b.name))
if __name__=="__main__":
from optparse import OptionParser
......@@ -122,6 +146,7 @@ if __name__=="__main__":
parser.add_option("--same",help="Also show values that didn't change",action="store_true")
parser.add_option("--tabdel",help="Specify a tab-delimited output file for easy import into Excel")
parser.add_option("--html",help="HTML output. Argument is file name base")
parser.add_option("--features",help="Compare feature files also",action='store_true')
(options,args) = parser.parse_args()
if len(args)!=2:
......
......@@ -99,6 +99,7 @@ def parse_feature_line(line):
if len(ary)<MIN_FIELDS_PER_FEATURE_FILE_LINE or len(ary)>MAX_FIELDS_PER_FEATURE_FILE_LINE:
# Don't know
return None
if b"\xf4\x80\x80\x9c" in ary[0]: return ary # contains files
if len(ary[0])<1: return None
if ary[0][0]<ord('0') or ary[0][0]>ord('9'): return None
return ary
......@@ -124,7 +125,7 @@ def is_feature_filename(fname):
if "_stopped" in fname: return False
if "_tags" in fname: return False
if "wordlist" in fname: return False
if "alerts.txt" in fname: return False
#if "alerts.txt" in fname: return False
return None # don't know
......@@ -138,6 +139,7 @@ class BulkReport:
b.histogram_files() - Set of histogram names
b.feature_files()
b.files - Set of all files
b.get_features(fname) - just get the features
"""
def __init__(self,fn,do_validate=True):
......@@ -190,7 +192,7 @@ class BulkReport:
return
if fn.endswith(".txt"):
import sys
print("***\n*** {} ends with .txt\n*** BulkReader wants the report directory, not the individual feature file\n***".format(fn),file=sys.stderr)
print("***\n*** {} ends with .txt\n*** BulkReader wants the report directory, not the individual feature file\n***".format(fn))
raise RuntimeError("Cannot process " + fn)
def image_filename(self):
......@@ -249,7 +251,6 @@ class BulkReport:
count += 1
return count
def is_histogram_file(self,fn):
if is_histogram_filename(fn)==True: return True
for line in self.open(fn,'r'):
......@@ -283,10 +284,9 @@ class BulkReport:
def carved_files(self):
return sorted(filter(lambda fn:"/" in fn,self.files))
def read_histogram(self,fn):
def read_histogram_entries(self,fn):
"""Read a histogram file and return a dictonary of the histogram. Removes \t(utf16=...) """
import re
ret = {}
r = re.compile(b"^n=(\d+)\t(.*)$")
for line in self.open(fn,'r'):
# line = line.decode('utf-8')
......@@ -295,8 +295,24 @@ class BulkReport:
k = m.group(2)
p = k.find(b"\t")
if p>0: k = k[0:p]
ret[k] = int(m.group(1))
yield (k,int(m.group(1)))
def read_histogram(self,fn):
    """Read a histogram file and return it as a dictionary.

    The dictionary is built from the (key, count) pairs produced by
    self.read_histogram_entries(fn).  If a key is produced more than
    once, the count from the last occurrence is kept.
    """
    # The earlier loop stored int(m.group(1)), but no `m` exists in this
    # method, so it raised NameError on the first entry.  The count is
    # already the second element of each pair.
    return dict(self.read_histogram_entries(fn))
def read_features(self,fname):
    """Generate the parsed features of one feature file.

    Usage: for (pos,feature,context) in br.read_features("fname")

    Every line obtained from self.open(fname) is handed to
    parse_feature_line(); lines for which it returns a false value
    are skipped.
    """
    parsed_lines = (parse_feature_line(raw) for raw in self.open(fname))
    for parsed in parsed_lines:
        if not parsed:
            continue
        yield parsed
if(__name__=='__main__'):
from optparse import OptionParser
......
This diff is collapsed.
......@@ -33,9 +33,6 @@ sys.path.append("../lib/") # add the library
import ttable, bulk_extractor_reader
# This regular expression matches on bulk_extractor offset and item
feature_re = re.compile("([^\t]+)\t([^\t\n\r]+)")
class Correlator:
"""The main correlator class.
Correlates features on different disks.
......@@ -53,7 +50,7 @@ class Correlator:
return max([len(s) for s in self.drives])
def longest_feature_name(self):
return max([len(s) for s in self.items.keys()])
return max([len(s) for s in self.features.keys()])
def ingest_feature_file(self,f,context_stop_list):
"""Read the lines in a feature file; returns how many lines were procesed"""
......@@ -101,17 +98,17 @@ class Correlator:
featuredict = self.features[feature]
featuredict[drivename] = featuredict.get(drivename,0)+count
def print_stats(self,f):
def dump_stats(self,f):
f.write("Total Drives: {}\n".format(len(self.drives)))
f.write("Distinct {} features: {}\n".format(self.name,len(self.items)))
fmt = "{:" + str(self.longest_feature_name()) + "} {}"
f.write(fmt.format("Feature","Drive Count"))
f.write("Distinct {} features: {}\n".format(self.name,len(self.features)))
fmt = "{:" + str(self.longest_feature_name()) + "} {} {}\n"
f.write(fmt.format("Feature","Count","Drives"))
def keysortfun(k):
return (-len(self.features[k]),k)
for d in sorted(self.features.keys(),key=keysortfun):
f.write(fmt.format(d,len(self.features[d])))
f.write(fmt.format(d,len(self.features[d]),self.features[d]))
if(__name__=="__main__"):
......@@ -125,6 +122,7 @@ if(__name__=="__main__"):
parser.add_argument("--makecombined",help="Combine multiple feature files into a single context stop list with no offests",action='store_true')
parser.add_argument("--idfeatures",help="Specifies feature files used for identity operations",
type=str,default="email,ccn,telephone")
parser.add_argument('--dump',help='Dump the CDA database',action='store_true')
parser.add_argument('reports', type=str, nargs='+', help='bulk_extractor report directories or ZIP files')
args = parser.parse_args()
......@@ -170,9 +168,14 @@ if(__name__=="__main__"):
for (feature,context) in context_stop_list:
f.write("".join(['','\t',feature,'\t',context,'\n']))
print("Created {} with {} lines\n".format(fn,len(context_stop_list)))
if args.makecombined:
print("DONE")
exit(0)
print("DONE")
exit(0)
if args.dump:
for c in correlators:
c.dump_stats(sys.stdout)
# Does the user want to make a stoplist?
if args.makestop:
......
#!/usr/bin/env python3
#
# NIST makes available at http://www.nsrl.nist.gov/morealgs.htm
# A text file is available which relates the MD5 of a complete file to the MD5 of the first 4096 bytes in the file (provided the file is that large).
# A 426 MB Zip file can be downloaded which contains a 823 MB text file with 13,112,687 rows.
# Each row has an MD5 hash of the entire file, a tab character, and an MD5 hash of the first 4k.
#
# This program scans the file and scans a bulk_extractor winpe.txt feature file
# to determine if the Windows executables are in NSRL
import sys

# Map: MD5 of the first 4096 bytes of a file -> MD5 of the complete file.
md5s = {}
with open("md5b4096.txt") as nsrl_file:
    for line in nsrl_file:
        try:
            # rstrip("\n") rather than line[0:-1]: the last line may lack a
            # trailing newline, and slicing would then drop a hash digit.
            (full,first) = line.rstrip("\n").split("\t")
        except ValueError:
            # Not exactly two tab-separated fields; ignore the row.
            continue
        md5s[first] = full

for fn in sys.argv[1:]:
    print("reading {}".format(fn))
    with open(fn) as feature_file:
        for line in feature_file:
            if line[0]=='#': continue
            fields = line.split("\t")
            if len(fields)<3:
                # Previously this aborted the whole run with an unpack error.
                print("skipping malformed line: {!r}".format(line),file=sys.stderr)
                continue
            (offset,md5,xml) = fields[0:3]
            md5 = md5.upper()
            if md5 in md5s:
                print(offset,md5,md5s[md5],xml[0:50])
            else:
                print("NOT IN NSRL: ",line)
......@@ -66,6 +66,7 @@ bulk_scanners = \
scan_outlook.cpp \
scan_facebook.cpp \
scan_pdf.cpp \
scan_msxml.cpp \
scan_rar.cpp \
scan_hashdb.cpp \
scan_sqlite.cpp \
......
......@@ -148,6 +148,11 @@ b64_pton_forensic(char const *src, int srclen, unsigned char *target, size_t tar
if (ch == Pad64) break;
/* HANDLE RFC4648 */
if(ch=='-') ch='+';
if(ch=='_') ch='/';
#ifdef HAVE_CONFORMING_STRCHR
pos = strchr(Base64, ch);
#else
......
......@@ -219,6 +219,7 @@ extern "C" scanner_t scan_lightgrep;
#endif
extern "C" scanner_t scan_facebook;
extern "C" scanner_t scan_pdf;
extern "C" scanner_t scan_msxml;
extern "C" scanner_t scan_winlnk;
extern "C" scanner_t scan_winpe;
extern "C" scanner_t scan_winprefetch;
......
......@@ -71,6 +71,7 @@ scanner_t *scanners_builtin[] = {
scan_gzip,
scan_outlook,
scan_pdf,
scan_msxml,
scan_winpe,
scan_hiberfile,
scan_winlnk,
......
......@@ -10,6 +10,7 @@ static const uint32_t B64_NUMBER=4;
static const uint32_t B64_SYMBOL=8;
static int base64array[256]; // array of valid base64 characters,
static size_t minlinewidth = 60;
static size_t maxlinewidth_needed_for_character_classes = 160;
inline bool isbase64(unsigned char ch)
......@@ -25,7 +26,7 @@ inline bool isbase64(unsigned char ch)
* @param len - the length of the line
* @return true - a line was found; false - a line was not found
*/
inline bool sbuf_getline(const sbuf_t &sbuf,size_t &pos,size_t &start,size_t &len)
inline bool sbuf_getline(const sbuf_t &sbuf,size_t &pos,size_t &line_start,size_t &line_len)
{
/* Scan forward until pos is at the beginning of a line */
if(pos >= sbuf.pagesize) return false;
......@@ -35,14 +36,14 @@ inline bool sbuf_getline(const sbuf_t &sbuf,size_t &pos,size_t &start,size_t &le
}
if(pos >= sbuf.pagesize) return false; // didn't find another start of a line
}
start = pos;
line_start = pos;
/* Now scan to end of the line, or the end of the buffer */
while(++pos < sbuf.pagesize){
if(sbuf[pos]=='\n'){
break;
}
}
len = (pos-start);
line_len = (pos-line_start);
return true;
}
......@@ -61,7 +62,10 @@ inline bool sbuf_line_is_base64(const sbuf_t &sbuf,const size_t &start,const siz
}
if (inequal) return false; // after we find an equal, only space is acceptable
uint8_t ch = sbuf[i];
if (base64array[ch]==0) return false;// non base64 character
if (base64array[ch]==0){
//fprintf(stderr,"NON CHAR '%c'\n",ch);
return false;// non base64 character
}
b64_classes |= base64array[ch]; // record the classes we have found
if (ch!='A') only_A = false;
}
......@@ -75,10 +79,12 @@ inline bool sbuf_line_is_base64(const sbuf_t &sbuf,const size_t &start,const siz
* capital As, which is commonly seen in BASE64 (because all capital As are nulls)
*/
if (only_A) return true; // all capital As are true
if ((b64_classes & B64_UPPERCASE)==0) return false; // must have an uppercase character
if ((b64_classes & B64_LOWERCASE)==0) return false; // must have an lowercase character
if(len>maxlinewidth_needed_for_character_classes){
if (only_A) return true; // all capital As are true
if ((b64_classes & B64_UPPERCASE)==0) return false; // must have an uppercase character
if ((b64_classes & B64_LOWERCASE)==0) return false; // must have an lowercase character
}
//fprintf(stderr,"OK\n");
return true;
}
......@@ -111,6 +117,8 @@ inline void process(const class scanner_params &sp,const recursion_control_block
extern "C"
void scan_base64(const class scanner_params &sp,const recursion_control_block &rcb)
{
const int debug=0;
assert(sp.sp_version==scanner_params::CURRENT_SP_VERSION);
if(sp.phase==scanner_params::PHASE_STARTUP){
assert(sp.info->si_version==scanner_info::CURRENT_SI_VERSION);
......@@ -123,6 +131,8 @@ void scan_base64(const class scanner_params &sp,const recursion_control_block &r
memset(base64array,0,sizeof(base64array));
base64array[(int)'+'] = B64_SYMBOL;
base64array[(int)'/'] = B64_SYMBOL;
base64array[(int)'-'] = B64_SYMBOL; // RFC 4648
base64array[(int)'_'] = B64_SYMBOL; // RFC 4648
for(int ch='a';ch<='z';ch++){ base64array[ch] = B64_LOWERCASE; }
for(int ch='A';ch<='Z';ch++){ base64array[ch] = B64_UPPERCASE; }
for(int ch='0';ch<='9';ch++){ base64array[ch] = B64_NUMBER; }
......@@ -132,6 +142,8 @@ void scan_base64(const class scanner_params &sp,const recursion_control_block &r
if(sp.phase==scanner_params::PHASE_SCAN){
const sbuf_t &sbuf = sp.sbuf;
/* base64 is a newline followed by at least two lines of constant length,
* followed by an incomplete line ending with an equal sign.
* Lines can be terminated by \n or \r\n. This code simply ignores \r,
......@@ -150,7 +162,7 @@ void scan_base64(const class scanner_params &sp,const recursion_control_block &r
size_t line_len = 0; // length of the line
bool found_equal = false;
while(sbuf_getline(sbuf,pos,line_start,line_len)){
//fprintf(stderr,"pos=%zd\n",pos);
if(debug) fprintf(stderr,"BASE64 pos=%zd line_start=%zd line_len=%zd\n",pos,line_start,line_len);
if(sbuf_line_is_base64(sbuf,line_start,line_len,found_equal)){
if(inblock==false){
/* First line of a block! */
......@@ -163,8 +175,9 @@ void scan_base64(const class scanner_params &sp,const recursion_control_block &r
continue;
}
if(line_len!=prevlen){ // whoops! Lines are different lengths
if(found_equal && linecount>1){
//fprintf(stderr,"1. linecount=%zd\n",linecount);
// equal signs no longer required at end of BASE64 blocks
if(linecount>1){
if(debug) fprintf(stderr,"BASE64 1. linecount=%zd (%zd!=%zd) \n",linecount,line_len,prevlen);
process(sp,rcb,blockstart,pos-blockstart);
}
inblock=false;
......@@ -178,12 +191,11 @@ void scan_base64(const class scanner_params &sp,const recursion_control_block &r
* alignment issues.
*/
if(linecount>2 && inblock){
//fprintf(stderr,"2. blockstart=%zd line_start=%zd pos=%zd linecount=%zd\n",blockstart,line_start,pos,linecount);
if(debug) fprintf(stderr,"BASE64 2. blockstart=%zd line_start=%zd pos=%zd linecount=%zd\n",blockstart,line_start,pos,linecount);
process(sp,rcb,blockstart,pos-blockstart);
}
inblock = false;
}
}
//fprintf(stderr,"done\n");
}
}
......@@ -114,10 +114,16 @@ ELEV (-?[0-9]{1,6}[.][0-9]{0,3})
s.clear();
s.lat = gps_scanner::get_quoted_attrib(yytext,"lat");
s.lon = gps_scanner::get_quoted_attrib(yytext,"lon");
s.pos += yyleng;
s.pos += yyleng;
}
[<]/trkpt[>] {
gps_scanner &s = *yygps_get_extra(yyscanner);
s.clear();
s.pos += yyleng;
}
[<]ele[>]{ELEV}[<][/]ele[>] {
gps_scanner &s = *yygps_get_extra(yyscanner);
s.ele = gps_scanner::get_cdata(yytext);
......
/**
* scan_msxml:
* Extracts text from Microsoft XML files.
*
*/
#include "config.h"
#include "be13_api/bulk_extractor_i.h"
#include "image_process.h"
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <iomanip>
#include <cassert>
#define ZLIB_CONST
#ifdef HAVE_DIAGNOSTIC_UNDEF
# pragma GCC diagnostic ignored "-Wundef"
#endif
#ifdef HAVE_DIAGNOSTIC_CAST_QUAL
# pragma GCC diagnostic ignored "-Wcast-qual"
#endif
#include <zlib.h>
using namespace std;
static bool pdf_dump = false;
/*
* The problem with trying to extract text from PDF is that sometimes PDF splits actual
* things that we want, like (exampl) (le@co) (mpany.com).
* Other times it doesn't, but we don't want to combine because that will
* break things, like (email) (me) (at) (example@company.com).
*
* There's no good solution here without rendering the PDF file, and even that doesn't work
* all the time (witness how poor Adobe's own extract-text-from-PDF feature is).
*
* We could do both, but then there would need to be a way to distinguish the mode.
*
* So the approach that is used is to scan the entire block and see the largest chunk
* within (parentheses). If we find spaces within the parentheses, don't add spaces between
* them, otherwise do.
*
* Spaces are always added between arrays [foo].
* So we just put a space between them all and hope.
*/
/**
 * scan_msxml: scanner entry point.
 *
 * Startup phase:  fills in the scanner_info fields (name "msxml", the
 *                 SCANNER_RECURSE flag) and registers the "pdf_dump"
 *                 configuration option, then returns.
 * Shutdown phase: nothing to do.
 * Scan phase:     if the buffer begins with "<?xml ", copies every byte
 *                 that lies outside a <...> tag into a new buffer, adds a
 *                 newline for each "</w:p>" closing tag, and passes the
 *                 result to the recursion callback as a new sbuf.
 */
extern "C"
void scan_msxml(const class scanner_params &sp,const recursion_control_block &rcb)
{
    assert(sp.sp_version==scanner_params::CURRENT_SP_VERSION);

    if(sp.phase==scanner_params::PHASE_STARTUP){
        assert(sp.info->si_version==scanner_info::CURRENT_SI_VERSION);
        sp.info->name            = "msxml";
        sp.info->author          = "Simson Garfinkel";
        sp.info->description     = "Extracts text from Microsoft XML files";
        sp.info->scanner_version = "1.0";
        sp.info->flags           = scanner_info::SCANNER_RECURSE;
        sp.info->get_config("pdf_dump",&pdf_dump,"Dump the contents of PDF buffers");
        return; /* No features recorded */
    }

    /* Only the scan phase does any work; shutdown (and anything else) is a no-op. */
    if(sp.phase!=scanner_params::PHASE_SCAN) return;

    const sbuf_t &sbuf = sp.sbuf;
    const bool starts_with_xml_decl = (sbuf.substr(0,6)=="<?xml ");
    if(!starts_with_xml_decl) return;

    /* Collect the text between tags: '<' switches copying off, '>' switches it on. */
    std::string text;
    bool outside_tag = false;
    for(size_t i=0;i<sbuf.bufsize;i++){
        const char ch = static_cast<char>(sbuf[i]);
        if(ch=='<'){
            outside_tag = false;
            if(sbuf.substr(i,6)=="</w:p>"){
                text += "\n";           /* each closing w:p tag becomes a line break */
            }
        } else if(ch=='>'){
            outside_tag = true;
        } else if(outside_tag){
            text += ch;
        }
    }

    /* Copy the text into a managed buffer and recurse on it. */
    const size_t textlen = text.size();
    managed_malloc<char *>buf(textlen);
    if(!buf.buf) return;                /* allocation failed: nothing to hand on */

    memcpy(buf.buf,text.c_str(),textlen);
    pos0_t pos0_xml = sbuf.pos0 + rcb.partName;
    const sbuf_t sbuf_new(pos0_xml,reinterpret_cast<const u_char *>(buf.buf),textlen,textlen,false);
    (*rcb.callback)(scanner_params(sp,sbuf_new));
}
#include "config.h"
#include "be13_api/bulk_extractor_i.h"
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <string.h>
#include <inttypes.h>
/*
* Scanner that pipes all data to stdin of a subprocess. This has advantages:
* - it's easy to write or reuse small standalone programs or plugins
* - you can write/script in any language you like
* - you don't need to compile your own bulk_extractor with dependencies
*
* and disadvantages:
* - you can't use bulk_extractor's internal functionality to output data in standard form
* - you can't feed decoded output back into the process recursively
* - you can't accumulate information across blocks
*
*/
/* pipe_prog should usually point to an executable, or a script that looks like this:
* ---
* #!/bin/bash
* # set required variables, as necessary
* exec /path/to/my_program with arguments
* ---
*/
static char *const pipe_prog[] = {"./pipe_prog", NULL};
static char *const pipe_env[] = {"PATH=/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/usr/local/sbin", NULL};
static long long int bytesread;
/* Return the larger of the two integers (b when they are equal). */
int max(int a, int b)
{
    return (a > b) ? a : b;
}
extern "C"
void scan_pipe(const class scanner_params &sp,const recursion_control_block &rcb)
{
int child_stdin[2];
int child_stdout[2];
int child_stderr[2];
int childpid;
int cnt, ret, ret2;
unsigned int written;
assert(sp.version==canner_params::CURRENT_SP_VERSION);
if(sp.phase==scanner_params::startup){
assert(sp.info->si_version==scanner_info::CURRENT_SI_VERSION);
sp.info->name = "pipe";
sp.info->flags = scanner_info::SCANNER_DISABLED; // disabled for now
sp.info->feature_names.insert("pipe");
bytesread = 0;
return;
}
if(sp.phase==scanner_params::shutdown) {
// close...
//char _scan_pipe_buf[33];
//feature_recorder *pipe_recorder = sp.fs.get_name("pipe");
//snprintf(_scan_pipe_buf, 32, "Got %lli chars", bytesread);
//pipe_recorder->write(_scan_pipe_buf);
return;
}
if(sp.phase==scanner_params::scan){
// set up file descriptors for input and output, and exec command
bytesread += sp.sbuf.bufsize;
// pipe() returns 0 on success
if (pipe(child_stdin)) { perror("pipe() stdin"); return; }
if (pipe(child_stdout)) { perror("pipe() stdout"); return; }
if (pipe(child_stderr)) { perror("pipe() stderr"); return; }
childpid = fork();
if (childpid == -1) return;
if (childpid == 0) {
// child, set up final details and exec process
close(0); close(1); close(2);
dup2(child_stdin[0], 0); close(child_stdin[0]);
dup2(child_stdout[1], 1); close(child_stdout[1]);
dup2(child_stderr[1], 2); close(child_stderr[1]);
close(child_stdin[1]); close(child_stdout[0]); close(child_stderr[0]);
execve(pipe_prog[0], pipe_prog, pipe_env); // should never return
printf("execve error: %s: %s\n", pipe_prog[0], strerror(errno));
return;
} else {
// main process, write data to child_stdin[1] and read data from child_stdout[0] and child_stderr[0]
fd_set rfds, wfds, efds;
int maxfd;
char _scan_pipe_buf[BUFSIZ];
close(child_stdin[0]); close(child_stdout[1]); close(child_stderr[1]);
written = 0;
feature_recorder *pipe_recorder = sp.fs.get_name("pipe");
for (;;) {
ret2 = waitpid(childpid, &ret, WNOHANG);
if (ret2 == -1) { perror("waitpid"); return; }