Files
blender-addons-contrib/io_vector/pdf.py
Campbell Barton 9406de5f31 Cleanup: remove <pep8 compliant> comment
This is no longer necessary, see: T98554.
2022-06-03 11:52:13 +10:00

916 lines
29 KiB
Python

# ##### BEGIN GPL LICENSE BLOCK #####
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# ##### END GPL LICENSE BLOCK #####
"""Functions for dealing with PDF files.
"""
__author__ = "howard.trickey@gmail.com"
import re
import sys
try:
import zlib
except ImportError:
zlib = None
# Python 2 and 3 differ in the type of result of indexing into
# a 'bytes' string, so define a function that returns the
# integer value regardless of the python version.
if sys.version_info[0] == 3:
def ordat(b, i):
return b[i]
else:
def ordat(b, i):
return ord(b[i])
WARN = True # print Warnings about strange things?
# PDF objects
OBOOL = 0
ONUM = 1
OSTRING = 2
ONAME = 3
OARRAY = 4
ODICT = 5
OSTREAM = 6
ONULL = 7
OINDIRECTDEF = 8
OINDIRECTREF = 9
_re_psbool = re.compile(br'true|false')
_re_psint = re.compile(br'(\+|-)?[0-9]+')
_re_psreal = re.compile(br'((\+|-)?([0-9]+\.[0-9]*)|(\.[0-9]+))')
_re_psstring = re.compile(br'\((\\.|.)*?\)')
_re_pshexstring = re.compile(br'<[0-9A-Fa-f]*>')
_re_psname = re.compile(br'/([^\0\t\n\f\r \(\)<>[\]{}/%]*)')
_re_psnull = re.compile(br'null')
_re_pskeyword = re.compile(br'[A-Za-z]+')
_re_psstreameol = re.compile(br'\r\n|\n')
_re_psendstream = re.compile(br'endstream')
_re_pseol = re.compile(br'(\r\n|\n|\r)')
_re_pswhitespaceandcomments = re.compile(br'([\0\t\n\f\r ]|%[^\n\r]*[\n\r]+)*')
# Object Notes:
# The string re is not really right - the spec allows
# balanced parentheses to appear in a string unescaped.
# Also, strings need to convert the following escapes:
# \n \r \t \b \f \( \) \\ \ddd (octal).
# Octal chars ddd can have 1, 2, or 3 octal digits with high order
# overflow ignored and leading zeros as needed (but not required,
# needed only if a normal digit follows the octal one).
#
# Hex strings may have odd number of digits - the last is assumed to be 0
#
# The token / (with no regular characters after it) is a valid name
# Names may include arbitrary characters by writing its 2-digit hex
# code preceded by a '#'.
#
# Array objects: sequence of objects enclosed in [ and ]
#
# Dictionary objects: sequence of key-value pairs enclosed in << and >>
# The keys must be names. A dictionary entry whose value is null is
# equivalent to an absent entry.
# By convention, dictonary objects have a value with key /Type that
# identifies the type of dictionary. Sometimes there is a subtype,
# with key /Subtype or /S. Values of type and subtype are always names.
#
# Stream objects: a dictionary followed by zero or more lines of bytes
# bracketed by keywords "stream" and "endstream". All streams must be
# indirect objects, and the dictionary must be a direct object.
#
# An Indirect Object Definition: two ints (object #, generation #)
# followed by the object bracketed by "obj" and "endobj"
#
# An Indirect Object Reference: two ints (object #, generation #) followed
# by "R".
# An indirect object reference to an object not defined in the file
# is taken as a reference to the null object.
def GetPDFObject(s, i):
"""Get one complete object starting at s[i].
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
((objectid, value), int) - where int is i after the object
use None instead of (objectid, value) if
there are no more objects in s
"""
m = _re_pswhitespaceandcomments.match(s, i)
if m:
i = m.end()
if i == len(s):
return (None, i)
m = _re_psname.match(s, i)
if m:
return ((ONAME, s[m.start() + 1:m.end()].decode()), m.end())
m = _re_psreal.match(s, i)
if m:
return ((ONUM, float(m.group())), m.end())
m = _re_psint.match(s, i)
if m:
# could also be start of indirect object def or ref
(o, j) = GetPDFIndirectObjectRefOrDef(s, i)
if o is not None:
return (o, j)
else:
return ((ONUM, int(m.group())), m.end())
m = _re_psbool.match(s, i)
if m:
if m.group == b'true':
v = True
else:
v = False
return ((OBOOL, v), m.end())
m = _re_psnull.match(s, i)
if m:
return ((ONULL, None), m.end())
m = _re_psstring.match(s, i)
if m:
return GetPDFLiteralString(s, i)
m = _re_pshexstring.match(s, i)
if m:
return GetPDFHexString(s, i, m.end())
c = ordat(s, i)
if c == ord('['):
return GetPDFArray(s, i)
elif c == ord('<') and i < len(s) - 1 and ordat(s, i + 1) == ord('<'):
(o, j) = GetPDFDict(s, i)
# check if followed by stream
(w, k) = GetPDFKeyword(s, j)
if w == b'stream':
m = _re_psstreameol.match(s, k)
if m:
streamstart = m.end()
streamend = s.find(b'endstream', streamstart)
if streamend > 0:
# just return byte offsets in s where stream
# contents start and (most probably) end
return ((OSTREAM, (o[1], streamstart, streamend)),
streamend + 9)
return (o, j)
return (None, i + 1)
def GetPDFIndirectObjectRefOrDef(s, i):
"""Get an indirect object def or ref starting at s[i].
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
((OINDIRECTDEF, (obj#,gen#,obj)), newi)
or ((OINDIRECTREF, (obj#, gen#)), newi)
or (None, i)
"""
(v, j) = GetPDFTwoInts(s, i)
if v is None:
return (None, i)
(obj_number, gen_number) = v
(w, j) = GetPDFKeyword(s, j)
if w == b'R':
return ((OINDIRECTREF, (obj_number, gen_number)), j)
elif w == b'obj':
(obj, j) = GetPDFObject(s, j)
if obj is not None:
(w, j) = GetPDFKeyword(s, j)
if w == b'endobj':
return ((OINDIRECTDEF, (obj_number, gen_number, obj)), j)
return (None, i)
def GetPDFTwoInts(s, i):
"""If there are two ints starting at s[i], return them, else return None.
Args:
s: PDF file contents
i: index into s
Returns:
((int, int), newi) or (None, i)
"""
m = _re_pswhitespaceandcomments.match(s, i)
j = i
if m:
j = m.end()
if j == len(s):
return (None, i)
m = _re_psint.match(s, j)
if m:
a = int(m.group())
j = m.end()
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
return (None, i)
m = _re_psint.match(s, j)
if m:
b = int(m.group())
return ((a, b), m.end())
return (None, i)
def GetPDFKeyword(s, i):
"""Get a keyword (alphabetic chars, as byte string) starting at s[i].
If there is not a keyword there, just return (b'',i).
Args:
s: bytes holding contents of a PDF file
i: index into s
Returns:
(keyword, newi) - keyword will be b'' if couldn't find one
"""
m = _re_pswhitespaceandcomments.match(s, i)
j = i
if m:
j = m.end()
if j == len(s):
return (b'', i)
m = _re_pskeyword.match(s, j)
if m:
return (m.group(), m.end())
return (b'', i)
def GetPDFLiteralString(s, i):
"""Convert and return object for pdf literal string starting at s[i]."""
j = i + 1
balen = 0
v = []
while j < len(s):
c = ordat(s, j)
if c == ord(')'):
if balen == 0:
return ((OSTRING, ''.join(v)), j + 1)
else:
balen -= 1
elif c == ord('('):
balen += 1
elif c == ord('\\'):
j += 1
if j == len(s):
if WARN:
print("unterminated string at", i)
return ((OSTRING, ''.join(v)), j)
c = ordat(s, j)
if c == ord('n'):
v += '\n'
elif c == ord('f'):
v += '\f'
elif c == ord('r'):
v += '\r'
elif c == ord('t'):
v += '\t'
elif ord('0') <= c <= ord('7'):
x = c - ord('0')
j += 1
if j < len(s):
c = ordat(s, j)
if ord('0') <= c <= ord('7'):
x = x * 8 + c - ord('0')
j += 1
if j < len(s):
c = ordat(s, j)
if ord('0') <= c <= ord('7'):
x = x * 8 + c - ord('0')
if x >= 256:
x %= 256
v += chr(x)
else:
m = _re_pseol.match(s, j)
if m:
# backslash used for line continuation
j = m.end() - 1 # -1 to compensate for +1 that will happen
else:
v += chr(c)
else:
m = _re_pseol.match(s, j)
if m:
v += '\n'
j = m.end() - 1 # -1 to compensate for +1
else:
v += chr(c)
j += 1
if WARN:
print('unterminated string at', i)
return ((OSTRING, ''.join(v)), j)
def GetPDFHexString(s, i, iend):
"""Convert and return pdf hex string starting at s[i],
ending at s[iend-1]."""
j = i + 1
v = []
c = ''
jend = iend - 1
while j < jend:
p = _re_pswhitespaceandcomments.match(s, j)
if p:
j = p.end()
d = chr(ordat(s, j))
if c != '':
v.append(FromHexPair(c, d))
c = ''
else:
c = d
j += 1
if c != '':
v.append(FromHexPair(c, '0'))
return ((OSTRING, ''.join(v)), iend)
def FromHexPair(a, b):
"""Interpret string a+b as hex pair, and return the pair's value."""
try:
v = int(a + b, 16)
except TypeError:
if WARN:
print('funny hex pair', a + b)
v = 0
return chr(v)
def GetPDFArray(s, i):
"""Convert and return object array starting at s[i]."""
j = i + 1
v = []
while j < len(s):
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
break
if ordat(s, j) == ord(']'):
return ((OARRAY, v), j + 1)
(o, j) = GetPDFObject(s, j)
if o is None:
break
v.append(o)
if WARN:
print('unterminated array starting at', i)
return ((OARRAY, v), j)
def GetPDFDict(s, i):
"""Convert and return object dict starting at s[i]."""
j = i + 2
v = {}
while j < len(s):
m = _re_pswhitespaceandcomments.match(s, j)
if m:
j = m.end()
if j == len(s):
break
if ordat(s, j) == ord('>') and ordat(s, j + 1) == ord('>'):
return ((ODICT, v), j + 2)
(o, j) = GetPDFObject(s, j)
if o is None:
break
if o[0] != ONAME:
if WARN:
print('expected name at', i)
break
name = o[1]
(o, j) = GetPDFObject(s, j)
if o is None:
break
v[name] = o
if WARN:
print('unterminated dict starting at', i)
return ((ODICT, v), j)
# Crossref dict:
# Cross references are a way of turning an (object #, generation #) into
# an actual object in the file, when and indirect reference of the form
# object# generation# R
# is found in another object.
# Cross references are of two types:
# 1) uncompressed: you find the object at a specified byte offset in the file
# 2) compressed: you find the object in an object stream which is in turn found
# by looking for a specified object# with implicit generation 0.
# We will build a map from (object#, generation#) to a tuple
# (kind, field2, field3)
# where if kind==XUNCOMPRESSED then field2 is the file byte offset of the object and field2
# is its generation #
# and if kind==XCOMPRESSED then field2 is the object # of the object stream containing it,
# and field3 is the index of that object within the stream
XUNCOMPRESSED = 1
XCOMPRESSED = 2
def GetPDFTrailerAndCrossrefs(s):
"""Find and return the (last) PDF trailer dictionary and cross reference
dict.
Args:
s: PDF file (as bytes)
Returns:
(trailer dict, crossref dict)
"""
startxrefi = s.rfind(b'startxref')
if startxrefi == -1:
if WARN:
print('cannot find startxref')
return (None, None)
crossrefi = -1
m = _re_pseol.match(s, startxrefi + 9)
if m:
m2 = _re_psint.match(s, m.end())
if m2:
crossrefi = int(m2.group())
if crossrefi <= 0:
if WARN:
print('cannot find crossref index')
return (None, None)
crossrefs = {}
d = None
last_trailerdict = None
if s[crossrefi:crossrefi+4] != b'xref':
# Could be Crossref stream
(obj, j) = GetPDFObject(s, crossrefi)
if PDFObjHasType(obj, OINDIRECTDEF):
strobj = obj[1][2]
if PDFObjHasType(strobj, OSTREAM):
strxrefs = GetPDFStreamContents(strobj, s, {}, False)
if strxrefs is None:
if WARN:
print('cannot decode crossref stream')
return (None, {})
d = strobj[1][0]
w = GetTypedValFromDictEntry(d, 'W', OARRAY, s, {})
ty = GetTypedValFromDictEntry(d, 'Type', ONAME, s, {})
sz = GetTypedValFromDictEntry(d, 'Size', ONUM, s, {})
index = GetTypedValFromDictEntry(d, 'Index', OARRAY, s, {})
prev = GetTypedValFromDictEntry(d, 'Prev', ONUM, s, {})
if ty != 'XRef' or sz is None or w is None:
if WARN:
print('something wrong with XRef stream dictionary')
return (None, {})
n1 = w[0][1]
n2 = w[1][1]
n3 = w[2][1]
ntot = n1 + n2 + n3
firstobjnum = 0
numobjs = sz
if index is not None:
firstobjnum = index[0][1]
numobjs = index[1][1]
k = 0
objnum = firstobjnum
while k + ntot <= len(strxrefs):
if n1 == 0:
f1 = 1
else:
(f1, k) = GetPDFMultiByteInt(strxrefs, k, n1)
(f2, k) = GetPDFMultiByteInt(strxrefs, k, n2)
if n3 == 0:
f3 = 0
else:
(f3, k) = GetPDFMultiByteInt(strxrefs, k, n3)
if f1 == 1:
crossrefs[(objnum, f3)] = (XUNCOMPRESSED, f2, f3)
elif f1 == 2:
crossrefs[(objnum, 0)] = (XCOMPRESSED, f2, f3)
elif f1 != 0:
if WARN:
print('unexpected type in XRef:', f1)
return (None, {})
objnum += 1
else:
if WARN:
print("no xref and object there is not stream")
print (obj)
else:
if WARN:
print("no xref and not indirect def")
print(obj)
return (d, crossrefs)
while crossrefi > 0:
i = crossrefi
if s[i:i + 4] != b'xref':
if WARN:
print('cannot find xref')
break
m = _re_pseol.match(s, i + 4)
if m:
i = m.end()
while i < startxrefi:
# Get start of subsection
(v, i) = GetPDFTwoInts(s, i)
if v is None:
break
(idstart, nentries) = v
m = _re_pswhitespaceandcomments.match(s, i)
if m:
i = m.end()
for k in range(idstart, idstart + nentries):
byteoffset = int(s[i:i + 10])
gen = int(s[i + 11:i + 16])
inuse = (ordat(s, i + 17) == ord('n'))
if inuse:
crossrefs[(k, gen)] = (XUNCOMPRESSED, byteoffset, gen)
i += 20
# Should be at 'trailer' now
(w, i) = GetPDFKeyword(s, i)
if w != b'trailer':
if WARN:
print('cannot find trailer')
break
(trailero, i) = GetPDFObject(s, i)
if trailero is None or trailero[0] != ODICT:
if WARN:
print('cannot find trailer dict')
break
trailerdict = trailero[1]
if last_trailerdict is None:
last_trailerdict = trailerdict
if 'Prev' in trailerdict:
crossrefi = GetTypedValFromDictEntry(trailerdict, 'Prev', ONUM, s, crossrefs)
if crossrefi is None:
crossrefi = -1
else:
crossrefi = -1
return (last_trailerdict, crossrefs)
def GetPDFMultiByteInt(s, i, fieldlen):
"""Get a multibyte int from a string of bytes
Args:
s: string of bytes
i: int, offset in s to start getting the result
fieldlen: int, how many bytes to get
Returns:
int: accumulated multibyte value (high order byte first in s)
"""
ans = 0
for k in range(i, i + fieldlen):
ans = ans * 256 + ord(s[k])
return (ans, i + fieldlen)
def ReadPDFPageOneContents(filename):
"""Read a PDF file and return Content string for its first page.
Args:
filename: name of file
Returns:
string: Content string for first page
"""
try:
f = open(filename, "rb") # binary since some parts may be compressed
except IOError:
if WARN:
print("Can't open file", filename)
return ''
contents = f.read()
f.close()
return GetPDFPageOneContents(contents)
def GetPDFPageOneContents(s):
"""Find and return first page in PDF file, given as string.
First get the last trailer's dictionary, which should contain
the Root object, and also the crossref dictionary which gives
byte offsets for all indirect objects.
Then from Root object, find Pages object (a page tree), and
follow leftmost Kid until get to a leaf Page object, which
in turn has the desired Contents object, which is a stream
or an array of streams. Decompress (if necessary) the
stream(s) and return their concatenation.
Args:
s: bytes holding contents of a PDF file
Returns:
string: the decoded (possibly decompressed) contents of the first page
"""
(trailerdict, crossrefs) = GetPDFTrailerAndCrossrefs(s)
if not trailerdict or not crossrefs:
if WARN:
print('problem finding trailer or crossrefs')
return ''
if 'Root' not in trailerdict:
if WARN:
print('cannot find Root object')
return ''
root = GetTypedValFromDictEntry(trailerdict, 'Root', ODICT, s, crossrefs)
if root is None:
if WARN:
print('cannot find root dictionary')
return ''
pagesdict = GetTypedValFromDictEntry(root, 'Pages', ODICT, s, crossrefs)
if pagesdict is None:
if WARN:
print('cannot find Pages dictionary')
return ''
pnode = pagesdict
while pnode:
pnodetype = PDFDictType(pnode)
if pnodetype == 'Pages':
kidsarray = GetTypedValFromDictEntry(pnode, 'Kids', OARRAY, s,
crossrefs)
if not kidsarray:
if WARN:
print('cannot find Kids in Pages')
return ''
if len(kidsarray) == 0:
if WARN:
print('Kids array has no Page')
return ''
pnodeobj = GetPDFObjFromIndirectRef(kidsarray[0], s, crossrefs)
if PDFObjHasType(pnodeobj, ODICT):
pnode = pnodeobj[1]
else:
if WARN:
print('Kids element has unexpected type')
return ''
elif pnodetype == 'Page':
contentsobj = GetPDFObjFromDictEntry(pnode, 'Contents', s,
crossrefs)
if contentsobj is None:
# it is legal for there to be no contents object:
# means empty page
if WARN:
print('First Page is empty')
return ''
if contentsobj[0] == OSTREAM:
return GetPDFStreamContents(contentsobj, s, crossrefs)
elif contentsobj[0] == OARRAY:
pieces = []
for c in contentsobj[1]:
if not PDFObjHasType(c, OINDIRECTREF):
if WARN:
print('Contents obj child not an indirect ref')
return ''
o = GetPDFObjFromIndirectRef(c, s, crossrefs)
if not PDFObjHasType(o, OSTREAM):
if WARN:
print('Contents obj child not a stream')
return ''
pieces.append(GetPDFStreamContents(o, s, crossrefs))
return '\n'.join(pieces)
else:
if WARN:
print('Contents object has unexpected type',
contentsobj[0])
return ''
else:
if WARN:
print('Page tree node has unexpected type', pnodetype)
return ''
# shouldn't get here
return ''
def GetPDFObjFromIndirectRef(obj, s, crossrefs):
"""Return the Object that is referred to by an indirect reference.
Args:
obj: (int, value) - should be (OINDIRECTREF, (obj_number, gen_number))
s: string - contents of PDF file
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
Returns:
(objectid, value) - the referred value (inside containing OINDIRECTDEF)
or None if there is any problem
"""
if not PDFObjHasType(obj, OINDIRECTREF):
return None
key = obj[1]
if key not in crossrefs:
return None
(f1, f2, f3) = crossrefs[key]
if f1 == XUNCOMPRESSED:
if f2 < 0 or f2 >= len(s):
return None
(o, _) = GetPDFObject(s, f2)
elif f1 == XCOMPRESSED:
o = GetPDFCompressedObject(s, f2, f3, crossrefs)
return o
else:
if WARN:
print("Bad xref type")
return None
if PDFObjHasType(o, OINDIRECTDEF):
return o[1][2]
else:
return None
def GetPDFCompressedObject(s, strnum, oindex, crossrefs):
"""Get one complete object from compressed stream.
Args:
s : bytes holding contents of a PDF file
strnum: object number of object stream where object is
oindex: index of object within the stream
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
Returns:
(objectid, value) - or None, if no such object
"""
strkey = (strnum, 0)
if strkey not in crossrefs:
if WARN:
print("could not find object", strnum, "in crossrefs")
return None
(g1, g2, g3) = crossrefs[strkey]
if g1 != XUNCOMPRESSED:
if WARN:
print("stream object is not uncompressed", g1, g2, g3)
return None
if g2 < 0 or g2 >= len(s):
return None
(ostream, _) = GetPDFObject(s, g2)
if PDFObjHasType(ostream, OINDIRECTDEF):
ostream = ostream[1][2]
if not PDFObjHasType(ostream, OSTREAM):
if WARN:
print("stream object does not have type stream")
return None
streamcont = GetPDFStreamContents(ostream, s, crossrefs, False)
d = ostream[1][0]
ty = GetTypedValFromDictEntry(d, "Type", ONAME, s, crossrefs)
if ty != "ObjStm":
if WARN:
print("stream object does not have Type ObjStm")
return None
n = GetTypedValFromDictEntry(d, "N", ONUM, s, crossrefs)
first = GetTypedValFromDictEntry(d, "First", ONUM, s, crossrefs)
if not n or not first:
if WARN:
print("required n or first not in object stream")
return None
i = 0
ans = None
for count in range(n):
(intpair, i) = GetPDFTwoInts(streamcont, i)
if not intpair:
if WARN:
print("stream object did not find int pair at count", count)
return None
(id, off) = intpair
obj = GetPDFObject(streamcont, first + off)
if count == oindex:
if obj:
ans = obj[0]
break
return ans
def GetPDFObjFromDictEntry(d, entryname, s, crossrefs):
"""Return the PDF object that should be at given entry in d.
Follow any Indirect refs until get a real object.
"""
if entryname not in d:
return None
o = d[entryname]
if PDFObjHasType(o, OINDIRECTREF):
return GetPDFObjFromIndirectRef(o, s, crossrefs)
else:
return o
def GetTypedValFromDictEntry(d, entryname, ty, s, crossrefs):
"""Return the value that should be found by entry in d and have type ty."""
o = GetPDFObjFromDictEntry(d, entryname, s, crossrefs)
if PDFObjHasType(o, ty):
return o[1]
else:
return None
def PDFObjHasType(o, ty):
"""Return True if o, a PDF Object, has type ty."""
if o is None:
return False
return o[0] == ty
def PDFDictType(d):
"""Return string value of 'Type' entry in d, '' if not there."""
if 'Type' in d:
tyobj = d['Type']
if PDFObjHasType(tyobj, ONAME):
return tyobj[1]
return ''
def GetPDFStreamContents(contentsobj, s, crossrefs, dodecode=True):
"""Return the contents of a stream object, applying any needed filters.
For now, only handle FlateDecode filter, and with no DecodeParms.
Args:
contentsobj: (OSTREAM, (dict, istart, iend))
s: bytes - PDF file contents
crossrefs: dict - maps (obj_number, gen_number) to byte offset in s
dodecode: bool - should we decode too?
Returns:
string - the contents (if dodecode, decoded using latin1 decoder)
"""
if not PDFObjHasType(contentsobj, OSTREAM):
return None
(d, istart, _) = contentsobj[1]
length = GetTypedValFromDictEntry(d, 'Length', ONUM, s, crossrefs)
if length is None:
return ''
ans = s[istart:istart + length]
filterobj = GetPDFObjFromDictEntry(d, 'Filter', s, crossrefs)
if filterobj is None:
return ans.decode()
filters = []
if PDFObjHasType(filterobj, ONAME):
filters = [filterobj[1]]
elif PDFObjHasType(filterobj, OARRAY):
for o in filterobj[1]:
if PDFObjHasType(o, ONAME):
filters.append(o[1])
for fname in filters:
if fname == 'FlateDecode':
if not zlib:
raise RuntimeError("pdf decoding requires missing zlib module")
parms = GetTypedValFromDictEntry(d, 'DecodeParms', ODICT, s, crossrefs)
needPngPredictor = False
columns = 1
if parms is not None:
predictor = GetTypedValFromDictEntry(parms, 'Predictor', ONUM, s, crossrefs)
columns = GetTypedValFromDictEntry(parms, 'Columns', ONUM, s, crossrefs)
if predictor is not None:
if predictor == 1:
pass
elif predictor < 10:
if WARN:
print('unhandled predictor type', predictor)
else:
if columns is None:
columns = 1
needPngPredictor = True
ans = zlib.decompress(ans)
if needPngPredictor:
ansbytes = []
col1 = columns + 1
k = 0
currow = [0] * columns
while k + col1 <= len(ans):
if ans[k] != 2:
if WARN:
print('unhandled PNG predictor type: ', ans[k])
k += 1
for j in range(0, columns):
currow[j] = (currow[j] + ans[k + j]) & 0xFF
ansbytes.append(chr(currow[j]))
k += columns
if k != len(ans):
if WARN:
print("FlateDecode with prediction didn't consume all bytes")
ans = ''.join(ansbytes)
if dodecode:
ans = ans.decode(encoding='latin1', errors='ignore')
else:
if WARN:
print('unhandled stream filter', fname)
return ''
return ans
if __name__ == "__main__":
if len(sys.argv) == 2:
page1contents = ReadPDFPageOneContents(sys.argv[1])
sys.stdout.write(page1contents)