Commit fc687888 authored by Sophie Brun's avatar Sophie Brun

Imported Upstream version 0.6.4

parent 223aacf2
......@@ -2,10 +2,10 @@
__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
__version__ = '0.4.3'
__date__ = '2013/09/18'
__version__ = '0.6.4'
__date__ = '2015/08/12'
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 3, 0)
__maximum_python_version__ = (3, 4, 3)
"""
Source code put in public domain by Didier Stevens, no Copyright
......@@ -43,10 +43,21 @@ History:
2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
2013/09/18: V0.4.3 fixed regression bug -w option
2014/09/25: V0.5.0 added option -g
2014/09/29: Added PrintGenerateObject and PrintOutputObject
2014/12/05: V0.6.0 Added YARA support
2014/12/09: cleanup, refactoring
2014/12/13: Python 3 fixes
2015/01/11: Added support for multiple YARA rule files; added request to search in trailer
2015/01/31: V0.6.1 Added optionyarastrings
2015/02/09: Added decoders
2015/04/05: V0.6.2 Added generateembedded
2015/04/06: fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace; fixed prettyprint bug
2015/04/24: V0.6.3 when option dump's filename is -, content is dumped to stdout
2015/08/12: V0.6.4 option hash now also calculates hashes of streams when selecting or searching objects; and displays hexasciidump first line
Todo:
- handle printf todo
- fix PrettyPrint
- support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir
"""
......@@ -58,6 +69,8 @@ import binascii
import hashlib
import sys
import zipfile
import time
import os
if sys.version_info[0] >= 3:
from io import StringIO
import urllib.request
......@@ -66,6 +79,10 @@ else:
from cStringIO import StringIO
import urllib2
urllib23 = urllib2
try:
import yara
except:
pass
CHAR_WHITESPACE = 1
CHAR_DELIMITER = 2
......@@ -83,6 +100,8 @@ PDF_ELEMENT_TRAILER = 4
PDF_ELEMENT_STARTXREF = 5
PDF_ELEMENT_MALFORMED = 6
dumplinelength = 16
#Convert 2 Bytes If Python 3
def C2BIP3(string):
if sys.version_info[0] > 2:
......@@ -90,6 +109,27 @@ def C2BIP3(string):
else:
return string
# CIC: Call If Callable
def CIC(expression):
if callable(expression):
return expression()
else:
return expression
# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
if expression:
return CIC(valueTrue)
else:
return CIC(valueFalse)
def Timestamp(epoch=None):
if epoch == None:
localTime = time.localtime()
else:
localTime = time.localtime(epoch)
return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]
def CopyWithoutWhiteSpace(content):
result = []
for token in content:
......@@ -367,6 +407,15 @@ class cPDFElementTrailer:
self.type = PDF_ELEMENT_TRAILER
self.content = content
def Contains(self, keyword):
data = ''
for i in range(0, len(self.content)):
if self.content[i][1] == 'stream':
break
else:
data += Canonicalize(self.content[i][1])
return data.upper().find(keyword.upper()) != -1
def IIf(expr, truepart, falsepart):
if expr:
return truepart
......@@ -379,6 +428,22 @@ class cPDFElementIndirectObject:
self.id = id
self.version = version
self.content = content
#fix stream for Ghostscript bug reported by Kurt
if self.ContainsStream():
position = len(self.content) - 1
if position < 0:
return
while self.content[position][0] == CHAR_WHITESPACE and position >= 0:
position -= 1
if position < 0:
return
if self.content[position][0] != CHAR_REGULAR:
return
if self.content[position][1] == 'endstream':
return
if not self.content[position][1].endswith('endstream'):
return
self.content = self.content[0:position] + [(self.content[position][0], self.content[position][1][:-len('endstream')])] + [(self.content[position][0], 'endstream')] + self.content[position+1:]
def GetType(self):
content = CopyWithoutWhiteSpace(self.content)
......@@ -484,7 +549,7 @@ class cPDFElementIndirectObject:
if EqualCanonical(filter, '/FlateDecode') or EqualCanonical(filter, '/Fl'):
try:
data = FlateDecode(data)
except zlib.error, e:
except zlib.error as e:
message = 'FlateDecode decompress failed'
if len(data) > 0 and ord(data[0]) & 0x0F != 8:
message += ', unexpected compression method: %02x' % ord(data[0])
......@@ -518,6 +583,30 @@ class cPDFElementIndirectObject:
else:
return data
def StreamYARAMatch(self, rules, decoders, decoderoptions, filter):
if not self.ContainsStream():
return None
streamData = self.Stream(filter)
if filter and streamData == 'No filters':
streamData = self.Stream(False)
oDecoders = [cIdentity(streamData, None)]
for cDecoder in decoders:
try:
oDecoder = cDecoder(streamData, decoderoptions)
oDecoders.append(oDecoder)
except Exception as e:
print('Error instantiating decoder: %s' % cDecoder.name)
raise e
results = []
for oDecoder in oDecoders:
while oDecoder.Available():
yaraResults = rules.match(data=oDecoder.Decode())
if yaraResults != []:
results.append([oDecoder.Name(), yaraResults])
return results
class cPDFElementStartxref:
def __init__(self, index):
self.type = PDF_ELEMENT_STARTXREF
......@@ -602,7 +691,7 @@ class cPDFParseDictionary:
value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
tokens = tokens[1:]
def retrieve(self):
def Retrieve(self):
return self.parsed
def PrettyPrintSub(self, prefix, dictionary):
......@@ -612,7 +701,11 @@ class cPDFParseDictionary:
if e[1] == []:
print('%s %s' % (prefix, e[0]))
elif type(e[1][0]) == type(''):
value = ''.join(e[1]).strip()
if len(e[1]) == 3 and IsNumeric(e[1][0]) and e[1][1] == '0' and e[1][2] == 'R':
joiner = ' '
else:
joiner = ''
value = joiner.join(e[1]).strip()
reprValue = repr(value)
if "'" + value + "'" != reprValue:
value = reprValue
......@@ -625,6 +718,12 @@ class cPDFParseDictionary:
def PrettyPrint(self, prefix):
self.PrettyPrintSub(prefix, self.parsed)
def Get(self, select):
for key, value in self.parsed:
if key == select:
return value
return None
def FormatOutput(data, raw):
if raw:
if type(data) == type([]):
......@@ -634,7 +733,33 @@ def FormatOutput(data, raw):
else:
return repr(data)
def PrintObject(object, options):
#Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
if sys.version_info[0] > 2:
sys.stdout.buffer.write(data)
else:
while data != '':
sys.stdout.write(data[0:10000])
try:
sys.stdout.flush()
except IOError:
return
data = data[10000:]
def IfWIN32SetBinary(io):
if sys.platform == 'win32':
import msvcrt
msvcrt.setmode(io.fileno(), os.O_BINARY)
def PrintOutputObject(object, options):
if options.dump == '-':
filtered = object.Stream(options.filter == True)
if filtered == []:
filtered = ''
IfWIN32SetBinary(sys.stdout)
StdoutWriteChunked(filtered)
return
print('obj %d %d' % (object.id, object.version))
print(' Type: %s' % ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput))
print(' Referencing: %s' % ', '.join(map(lambda x: '%s %s %s' % x, object.GetReferences())))
......@@ -645,6 +770,16 @@ def PrintObject(object, options):
if options.debug:
print(' %s' % FormatOutput(dataPrecedingStream, options.raw))
oPDFParseDictionary = cPDFParseDictionary(dataPrecedingStream, options.nocanonicalizedoutput)
if options.hash:
streamContent = object.Stream(False)
print(' unfiltered')
print(' len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
print(' %s' % HexAsciiDumpLine(streamContent))
streamContent = object.Stream(True)
print(' filtered')
print(' len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
print(' %s' % HexAsciiDumpLine(streamContent))
streamContent = None
else:
if options.debug or options.raw:
print(' %s' % FormatOutput(object.content, options.raw))
......@@ -742,7 +877,7 @@ def ASCIIHexDecode(data):
return binascii.unhexlify(''.join([c for c in data if c not in ' \t\n\r']).rstrip('>'))
def FlateDecode(data):
return zlib.decompress(data)
return zlib.decompress(C2BIP3(data))
def RunLengthDecode(data):
f = StringIO(data)
......@@ -845,10 +980,172 @@ class LZWDecoder(object):
def LZWDecode(data):
return ''.join(LZWDecoder(StringIO(data)).run())
def PrintGenerateObject(object, options, newId=None):
if newId == None:
objectId = object.id
else:
objectId = newId
dataPrecedingStream = object.ContainsStream()
if dataPrecedingStream:
if options.filter:
decompressed = object.Stream(True)
if decompressed == 'No filters' or decompressed.startswith('Unsupported filter: '):
print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
else:
dictionary = FormatOutput(dataPrecedingStream, True)
dictionary = re.sub(r'/Length\s+\d+', '', dictionary)
dictionary = re.sub(r'/Filter\s*/[a-zA-Z0-9]+', '', dictionary)
dictionary = re.sub(r'/Filter\s*\[.+\]', '', dictionary)
dictionary = re.sub(r'^\s*<<', '', dictionary)
dictionary = re.sub(r'>>\s*$', '', dictionary)
dictionary = dictionary.strip()
print(" oPDF.stream2(%d, %d, %s, %s, 'f')" % (objectId, object.version, repr(decompressed.rstrip()), repr(dictionary)))
else:
print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
else:
print(' oPDF.indirectobject(%d, %d, %s)' % (objectId, object.version, repr(FormatOutput(object.content, True).strip())))
def PrintObject(object, options):
if options.generate:
PrintGenerateObject(object, options)
else:
PrintOutputObject(object, options)
def File2Strings(filename):
try:
f = open(filename, 'r')
except:
return None
try:
return map(lambda line:line.rstrip('\n'), f.readlines())
except:
return None
finally:
f.close()
def ProcessAt(argument):
if argument.startswith('@'):
strings = File2Strings(argument[1:])
if strings == None:
raise Exception('Error reading %s' % argument)
else:
return strings
else:
return [argument]
def YARACompile(fileordirname):
dFilepaths = {}
if os.path.isdir(fileordirname):
for root, dirs, files in os.walk(fileordirname):
for file in files:
filename = os.path.join(root, file)
dFilepaths[filename] = filename
else:
for filename in ProcessAt(fileordirname):
dFilepaths[filename] = filename
return yara.compile(filepaths=dFilepaths)
def AddDecoder(cClass):
global decoders
decoders.append(cClass)
class cDecoderParent():
pass
def LoadDecoders(decoders, verbose):
if decoders == '':
return
scriptPath = os.path.dirname(sys.argv[0])
for decoder in sum(map(ProcessAt, decoders.split(',')), []):
try:
if not decoder.lower().endswith('.py'):
decoder += '.py'
if os.path.dirname(decoder) == '':
if not os.path.exists(decoder):
scriptDecoder = os.path.join(scriptPath, decoder)
if os.path.exists(scriptDecoder):
decoder = scriptDecoder
exec(open(decoder, 'r').read(), globals(), globals())
except Exception as e:
print('Error loading decoder: %s' % decoder)
if verbose:
raise e
class cIdentity(cDecoderParent):
name = 'Identity function decoder'
def __init__(self, stream, options):
self.stream = stream
self.options = options
self.available = True
def Available(self):
return self.available
def Decode(self):
self.available = False
return self.stream
def Name(self):
return ''
def DecodeFunction(decoders, options, stream):
if decoders == []:
return stream
return decoders[0](stream, options.decoderoptions).Decode()
class cDumpStream():
def __init__(self):
self.text = ''
def Addline(self, line):
if line != '':
self.text += line + '\n'
def Content(self):
return self.text
def HexDump(data):
oDumpStream = cDumpStream()
hexDump = ''
for i, b in enumerate(data):
if i % dumplinelength == 0 and hexDump != '':
oDumpStream.Addline(hexDump)
hexDump = ''
hexDump += IFF(hexDump == '', '', ' ') + '%02X' % ord(b)
oDumpStream.Addline(hexDump)
return oDumpStream.Content()
def CombineHexAscii(hexDump, asciiDump):
if hexDump == '':
return ''
return hexDump + ' ' + (' ' * (3 * (dumplinelength - len(asciiDump)))) + asciiDump
def HexAsciiDump(data):
oDumpStream = cDumpStream()
hexDump = ''
asciiDump = ''
for i, b in enumerate(data):
if i % dumplinelength == 0:
if hexDump != '':
oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
hexDump = '%08X:' % i
asciiDump = ''
hexDump+= ' %02X' % ord(b)
asciiDump += IFF(ord(b) >= 32, b, '.')
oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
return oDumpStream.Content()
def HexAsciiDumpLine(data):
return HexAsciiDump(data[0:16])[10:-1]
def Main():
"""pdf-parser, use it to parse a PDF document
"""
global decoders
oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__)
oParser.add_option('-s', '--search', help='string to search in indirect objects (except streams)')
oParser.add_option('-f', '--filter', action='store_true', default=False, help='pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only)')
......@@ -869,6 +1166,12 @@ def Main():
oParser.add_option('--unfiltered', action='store_true', default=False, help='search in unfiltered streams')
oParser.add_option('--casesensitive', action='store_true', default=False, help='case sensitive search in streams')
oParser.add_option('--regex', action='store_true', default=False, help='use regex to search in streams')
oParser.add_option('-g', '--generate', action='store_true', default=False, help='generate a Python program that creates the parsed PDF file')
oParser.add_option('--generateembedded', type=int, default=0, help='generate a Python program that embeds the selected indirect object as a file')
oParser.add_option('-y', '--yara', help='YARA rule (or directory or @file) to check streams (can be used with option --unfiltered)')
oParser.add_option('--yarastrings', action='store_true', default=False, help='Print YARA strings')
oParser.add_option('--decoders', type=str, default='', help='decoders to load (separate decoders with a comma , ; @file supported)')
oParser.add_option('--decoderoptions', type=str, default='', help='options for the decoder')
(options, args) = oParser.parse_args()
if len(args) != 1:
......@@ -880,6 +1183,9 @@ def Main():
print(' https://DidierStevens.com')
else:
decoders = []
LoadDecoders(options.decoders, True)
oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
cntComment = 0
cntXref = 0
......@@ -915,12 +1221,55 @@ def Main():
selectXref = True
selectTrailer = True
selectStartXref = True
if options.search:
selectTrailer = True
if options.type == '-':
optionsType = ''
else:
optionsType = options.type
if options.generate or options.generateembedded != 0:
savedRoot = ['1', '0', 'R']
print('#!/usr/bin/python')
print('')
print('"""')
print('')
print('Program generated by pdf-parser.py by Didier Stevens')
print('https://DidierStevens.com')
print('Use at your own risk')
print('')
print('Input PDF file: %s' % args[0])
print('This Python program was created on: %s' % Timestamp())
print('')
print('"""')
print('')
print('import mPDF')
print('import sys')
print('')
print('def Main():')
print(' if len(sys.argv) != 2:')
print(" print('Usage: %s pdf-file' % sys.argv[0])")
print(' return')
print(' oPDF = mPDF.cPDF(sys.argv[1])')
if options.generateembedded != 0:
print(" oPDF.header('1.1')")
print(r" oPDF.comment('\xd0\xd0\xd0\xd0')")
print(r" oPDF.indirectobject(1, 0, '<<\r\n /Type /Catalog\r\n /Outlines 2 0 R\r\n /Pages 3 0 R\r\n /Names << /EmbeddedFiles << /Names [(test.bin) 7 0 R] >> >>\r\n>>')")
print(r" oPDF.indirectobject(2, 0, '<<\r\n /Type /Outlines\r\n /Count 0\r\n>>')")
print(r" oPDF.indirectobject(3, 0, '<<\r\n /Type /Pages\r\n /Kids [4 0 R]\r\n /Count 1\r\n>>')")
print(r" oPDF.indirectobject(4, 0, '<<\r\n /Type /Page\r\n /Parent 3 0 R\r\n /MediaBox [0 0 612 792]\r\n /Contents 5 0 R\r\n /Resources <<\r\n /ProcSet [/PDF /Text]\r\n /Font << /F1 6 0 R >>\r\n >>\r\n>>')")
print(r" oPDF.stream(5, 0, 'BT /F1 12 Tf 70 700 Td 15 TL (This PDF document embeds file test.bin) Tj ET', '<< /Length %d >>')")
print(r" oPDF.indirectobject(6, 0, '<<\r\n /Type /Font\r\n /Subtype /Type1\r\n /Name /F1\r\n /BaseFont /Helvetica\r\n /Encoding /MacRomanEncoding\r\n>>')")
print(r" oPDF.indirectobject(7, 0, '<<\r\n /Type /Filespec\r\n /F (test.bin)\r\n /EF << /F 8 0 R >>\r\n>>')")
if options.yara != None:
if not 'yara' in sys.modules:
print('Error: option yara requires the YARA Python module.')
return
rules = YARACompile(options.yara)
while True:
object = oPDFParser.GetObject()
if object != None:
......@@ -935,16 +1284,24 @@ def Main():
cntStartXref += 1
elif object.type == PDF_ELEMENT_INDIRECT_OBJECT:
cntIndirectObject += 1
type = object.GetType()
if not type in dicObjectTypes:
dicObjectTypes[type] = [object.id]
type1 = object.GetType()
if not type1 in dicObjectTypes:
dicObjectTypes[type1] = [object.id]
else:
dicObjectTypes[type].append(object.id)
dicObjectTypes[type1].append(object.id)
else:
if object.type == PDF_ELEMENT_COMMENT and selectComment:
if options.generate:
comment = object.comment[1:].rstrip()
if re.match('PDF-\d\.\d', comment):
print(" oPDF.header('%s')" % comment[4:])
elif comment != '%EOF':
print(' oPDF.comment(%s)' % repr(comment))
elif options.yara == None and options.generateembedded == 0:
print('PDF Comment %s' % FormatOutput(object.comment, options.raw))
print('')
elif object.type == PDF_ELEMENT_XREF and selectXref:
if not options.generate and options.yara == None and options.generateembedded == 0:
if options.debug:
print('xref %s' % FormatOutput(object.content, options.raw))
else:
......@@ -952,6 +1309,12 @@ def Main():
print('')
elif object.type == PDF_ELEMENT_TRAILER and selectTrailer:
oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput)
if options.generate:
result = oPDFParseDictionary.Get('/Root')
if result != None:
savedRoot = result
elif options.yara == None and options.generateembedded == 0:
if not options.search or options.search and object.Contains(options.search):
if oPDFParseDictionary == None:
print('trailer %s' % FormatOutput(object.content, options.raw))
else:
......@@ -959,6 +1322,7 @@ def Main():
oPDFParseDictionary.PrettyPrint(' ')
print('')
elif object.type == PDF_ELEMENT_STARTXREF and selectStartXref:
if not options.generate and options.yara == None and options.generateembedded == 0:
print('startxref %d' % object.index)
print('')
elif object.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
......@@ -982,6 +1346,21 @@ def Main():
elif options.searchstream:
if object.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex):
PrintObject(object, options)
elif options.yara != None:
results = object.StreamYARAMatch(rules, decoders, options.decoderoptions, not options.unfiltered)
if results != None and results != []:
for result in results:
for yaraResult in result[1]:
print('YARA rule%s: %s (%s)' % (IFF(result[0] == '', '', ' (stream decoder: %s)' % result[0]), yaraResult.rule, yaraResult.namespace))
if options.yarastrings:
for stringdata in yaraResult.strings:
print('%06x %s:' % (stringdata[0], stringdata[1]))
print(' %s' % binascii.hexlify(C2BIP3(stringdata[2])))
print(' %s' % repr(stringdata[2]))
PrintObject(object, options)
elif options.generateembedded != 0:
if object.id == options.generateembedded:
PrintGenerateObject(object, options, 8)
else:
PrintObject(object, options)
elif object.type == PDF_ELEMENT_MALFORMED:
......@@ -1008,6 +1387,13 @@ def Main():
for key in names:
print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(map(lambda x: '%d' % x, dicObjectTypes[key]))))
if options.generate or options.generateembedded != 0:
print(" oPDF.xrefAndTrailer('%s')" % ' '.join(savedRoot))
print('')
print("if __name__ == '__main__':")
print(' Main()')
def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
if sys.version_info[0:3] > __maximum_python_version__:
if enforceMaximumVersion:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment