mirror of
https://github.com/blender/blender-addons-contrib.git
synced 2025-08-16 16:14:56 +00:00
916 lines
29 KiB
Python
916 lines
29 KiB
Python
# ##### BEGIN GPL LICENSE BLOCK #####
|
|
#
|
|
# This program is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU General Public License
|
|
# as published by the Free Software Foundation; either version 2
|
|
# of the License, or (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software Foundation,
|
|
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
#
|
|
# ##### END GPL LICENSE BLOCK #####
|
|
|
|
"""Functions for dealing with PDF files.
|
|
"""
|
|
|
|
__author__ = "howard.trickey@gmail.com"
|
|
|
|
import re
|
|
import sys
|
|
try:
|
|
import zlib
|
|
except ImportError:
|
|
zlib = None
|
|
|
|
# Python 2 and 3 differ in the type of result of indexing into
|
|
# a 'bytes' string, so define a function that returns the
|
|
# integer value regardless of the python version.
|
|
if sys.version_info[0] == 3:
|
|
def ordat(b, i):
|
|
return b[i]
|
|
else:
|
|
def ordat(b, i):
|
|
return ord(b[i])
|
|
|
|
WARN = True # print Warnings about strange things?
|
|
|
|
# PDF objects
|
|
OBOOL = 0
|
|
ONUM = 1
|
|
OSTRING = 2
|
|
ONAME = 3
|
|
OARRAY = 4
|
|
ODICT = 5
|
|
OSTREAM = 6
|
|
ONULL = 7
|
|
OINDIRECTDEF = 8
|
|
OINDIRECTREF = 9
|
|
|
|
_re_psbool = re.compile(br'true|false')
|
|
_re_psint = re.compile(br'(\+|-)?[0-9]+')
|
|
_re_psreal = re.compile(br'((\+|-)?([0-9]+\.[0-9]*)|(\.[0-9]+))')
|
|
_re_psstring = re.compile(br'\((\\.|.)*?\)')
|
|
_re_pshexstring = re.compile(br'<[0-9A-Fa-f]*>')
|
|
_re_psname = re.compile(br'/([^\0\t\n\f\r \(\)<>[\]{}/%]*)')
|
|
_re_psnull = re.compile(br'null')
|
|
_re_pskeyword = re.compile(br'[A-Za-z]+')
|
|
_re_psstreameol = re.compile(br'\r\n|\n')
|
|
_re_psendstream = re.compile(br'endstream')
|
|
_re_pseol = re.compile(br'(\r\n|\n|\r)')
|
|
_re_pswhitespaceandcomments = re.compile(br'([\0\t\n\f\r ]|%[^\n\r]*[\n\r]+)*')
|
|
|
|
# Object Notes:
|
|
# The string re is not really right - the spec allows
|
|
# balanced parentheses to appear in a string unescaped.
|
|
# Also, strings need to convert the following escapes:
|
|
# \n \r \t \b \f \( \) \\ \ddd (octal).
|
|
# Octal chars ddd can have 1, 2, or 3 octal digits with high order
|
|
# overflow ignored and leading zeros as needed (but not required,
|
|
# needed only if a normal digit follows the octal one).
|
|
#
|
|
# Hex strings may have odd number of digits - the last is assumed to be 0
|
|
#
|
|
# The token / (with no regular characters after it) is a valid name
|
|
# Names may include arbitrary characters by writing its 2-digit hex
|
|
# code preceded by a '#'.
|
|
#
|
|
# Array objects: sequence of objects enclosed in [ and ]
|
|
#
|
|
# Dictionary objects: sequence of key-value pairs enclosed in << and >>
|
|
# The keys must be names. A dictionary entry whose value is null is
|
|
# equivalent to an absent entry.
|
|
# By convention, dictonary objects have a value with key /Type that
|
|
# identifies the type of dictionary. Sometimes there is a subtype,
|
|
# with key /Subtype or /S. Values of type and subtype are always names.
|
|
#
|
|
# Stream objects: a dictionary followed by zero or more lines of bytes
|
|
# bracketed by keywords "stream" and "endstream". All streams must be
|
|
# indirect objects, and the dictionary must be a direct object.
|
|
#
|
|
# An Indirect Object Definition: two ints (object #, generation #)
|
|
# followed by the object bracketed by "obj" and "endobj"
|
|
#
|
|
# An Indirect Object Reference: two ints (object #, generation #) followed
|
|
# by "R".
|
|
# An indirect object reference to an object not defined in the file
|
|
# is taken as a reference to the null object.
|
|
|
|
|
|
def GetPDFObject(s, i):
|
|
"""Get one complete object starting at s[i].
|
|
|
|
Args:
|
|
s: bytes holding contents of a PDF file
|
|
i: index into s
|
|
Returns:
|
|
((objectid, value), int) - where int is i after the object
|
|
use None instead of (objectid, value) if
|
|
there are no more objects in s
|
|
"""
|
|
|
|
m = _re_pswhitespaceandcomments.match(s, i)
|
|
if m:
|
|
i = m.end()
|
|
if i == len(s):
|
|
return (None, i)
|
|
m = _re_psname.match(s, i)
|
|
if m:
|
|
return ((ONAME, s[m.start() + 1:m.end()].decode()), m.end())
|
|
m = _re_psreal.match(s, i)
|
|
if m:
|
|
return ((ONUM, float(m.group())), m.end())
|
|
m = _re_psint.match(s, i)
|
|
if m:
|
|
# could also be start of indirect object def or ref
|
|
(o, j) = GetPDFIndirectObjectRefOrDef(s, i)
|
|
if o is not None:
|
|
return (o, j)
|
|
else:
|
|
return ((ONUM, int(m.group())), m.end())
|
|
m = _re_psbool.match(s, i)
|
|
if m:
|
|
if m.group == b'true':
|
|
v = True
|
|
else:
|
|
v = False
|
|
return ((OBOOL, v), m.end())
|
|
m = _re_psnull.match(s, i)
|
|
if m:
|
|
return ((ONULL, None), m.end())
|
|
m = _re_psstring.match(s, i)
|
|
if m:
|
|
return GetPDFLiteralString(s, i)
|
|
m = _re_pshexstring.match(s, i)
|
|
if m:
|
|
return GetPDFHexString(s, i, m.end())
|
|
c = ordat(s, i)
|
|
if c == ord('['):
|
|
return GetPDFArray(s, i)
|
|
elif c == ord('<') and i < len(s) - 1 and ordat(s, i + 1) == ord('<'):
|
|
(o, j) = GetPDFDict(s, i)
|
|
# check if followed by stream
|
|
(w, k) = GetPDFKeyword(s, j)
|
|
if w == b'stream':
|
|
m = _re_psstreameol.match(s, k)
|
|
if m:
|
|
streamstart = m.end()
|
|
streamend = s.find(b'endstream', streamstart)
|
|
if streamend > 0:
|
|
# just return byte offsets in s where stream
|
|
# contents start and (most probably) end
|
|
return ((OSTREAM, (o[1], streamstart, streamend)),
|
|
streamend + 9)
|
|
return (o, j)
|
|
return (None, i + 1)
|
|
|
|
|
|
def GetPDFIndirectObjectRefOrDef(s, i):
|
|
"""Get an indirect object def or ref starting at s[i].
|
|
|
|
Args:
|
|
s: bytes holding contents of a PDF file
|
|
i: index into s
|
|
Returns:
|
|
((OINDIRECTDEF, (obj#,gen#,obj)), newi)
|
|
or ((OINDIRECTREF, (obj#, gen#)), newi)
|
|
or (None, i)
|
|
"""
|
|
|
|
(v, j) = GetPDFTwoInts(s, i)
|
|
if v is None:
|
|
return (None, i)
|
|
(obj_number, gen_number) = v
|
|
(w, j) = GetPDFKeyword(s, j)
|
|
if w == b'R':
|
|
return ((OINDIRECTREF, (obj_number, gen_number)), j)
|
|
elif w == b'obj':
|
|
(obj, j) = GetPDFObject(s, j)
|
|
if obj is not None:
|
|
(w, j) = GetPDFKeyword(s, j)
|
|
if w == b'endobj':
|
|
return ((OINDIRECTDEF, (obj_number, gen_number, obj)), j)
|
|
return (None, i)
|
|
|
|
|
|
def GetPDFTwoInts(s, i):
|
|
"""If there are two ints starting at s[i], return them, else return None.
|
|
|
|
Args:
|
|
s: PDF file contents
|
|
i: index into s
|
|
Returns:
|
|
((int, int), newi) or (None, i)
|
|
"""
|
|
|
|
m = _re_pswhitespaceandcomments.match(s, i)
|
|
j = i
|
|
if m:
|
|
j = m.end()
|
|
if j == len(s):
|
|
return (None, i)
|
|
m = _re_psint.match(s, j)
|
|
if m:
|
|
a = int(m.group())
|
|
j = m.end()
|
|
m = _re_pswhitespaceandcomments.match(s, j)
|
|
if m:
|
|
j = m.end()
|
|
if j == len(s):
|
|
return (None, i)
|
|
m = _re_psint.match(s, j)
|
|
if m:
|
|
b = int(m.group())
|
|
return ((a, b), m.end())
|
|
return (None, i)
|
|
|
|
|
|
def GetPDFKeyword(s, i):
|
|
"""Get a keyword (alphabetic chars, as byte string) starting at s[i].
|
|
|
|
If there is not a keyword there, just return (b'',i).
|
|
|
|
Args:
|
|
s: bytes holding contents of a PDF file
|
|
i: index into s
|
|
Returns:
|
|
(keyword, newi) - keyword will be b'' if couldn't find one
|
|
"""
|
|
|
|
m = _re_pswhitespaceandcomments.match(s, i)
|
|
j = i
|
|
if m:
|
|
j = m.end()
|
|
if j == len(s):
|
|
return (b'', i)
|
|
m = _re_pskeyword.match(s, j)
|
|
if m:
|
|
return (m.group(), m.end())
|
|
return (b'', i)
|
|
|
|
|
|
def GetPDFLiteralString(s, i):
|
|
"""Convert and return object for pdf literal string starting at s[i]."""
|
|
|
|
j = i + 1
|
|
balen = 0
|
|
v = []
|
|
while j < len(s):
|
|
c = ordat(s, j)
|
|
if c == ord(')'):
|
|
if balen == 0:
|
|
return ((OSTRING, ''.join(v)), j + 1)
|
|
else:
|
|
balen -= 1
|
|
elif c == ord('('):
|
|
balen += 1
|
|
elif c == ord('\\'):
|
|
j += 1
|
|
if j == len(s):
|
|
if WARN:
|
|
print("unterminated string at", i)
|
|
return ((OSTRING, ''.join(v)), j)
|
|
c = ordat(s, j)
|
|
if c == ord('n'):
|
|
v += '\n'
|
|
elif c == ord('f'):
|
|
v += '\f'
|
|
elif c == ord('r'):
|
|
v += '\r'
|
|
elif c == ord('t'):
|
|
v += '\t'
|
|
elif ord('0') <= c <= ord('7'):
|
|
x = c - ord('0')
|
|
j += 1
|
|
if j < len(s):
|
|
c = ordat(s, j)
|
|
if ord('0') <= c <= ord('7'):
|
|
x = x * 8 + c - ord('0')
|
|
j += 1
|
|
if j < len(s):
|
|
c = ordat(s, j)
|
|
if ord('0') <= c <= ord('7'):
|
|
x = x * 8 + c - ord('0')
|
|
if x >= 256:
|
|
x %= 256
|
|
v += chr(x)
|
|
else:
|
|
m = _re_pseol.match(s, j)
|
|
if m:
|
|
# backslash used for line continuation
|
|
j = m.end() - 1 # -1 to compensate for +1 that will happen
|
|
else:
|
|
v += chr(c)
|
|
else:
|
|
m = _re_pseol.match(s, j)
|
|
if m:
|
|
v += '\n'
|
|
j = m.end() - 1 # -1 to compensate for +1
|
|
else:
|
|
v += chr(c)
|
|
j += 1
|
|
if WARN:
|
|
print('unterminated string at', i)
|
|
return ((OSTRING, ''.join(v)), j)
|
|
|
|
|
|
def GetPDFHexString(s, i, iend):
|
|
"""Convert and return pdf hex string starting at s[i],
|
|
ending at s[iend-1]."""
|
|
|
|
j = i + 1
|
|
v = []
|
|
c = ''
|
|
jend = iend - 1
|
|
while j < jend:
|
|
p = _re_pswhitespaceandcomments.match(s, j)
|
|
if p:
|
|
j = p.end()
|
|
d = chr(ordat(s, j))
|
|
if c != '':
|
|
v.append(FromHexPair(c, d))
|
|
c = ''
|
|
else:
|
|
c = d
|
|
j += 1
|
|
if c != '':
|
|
v.append(FromHexPair(c, '0'))
|
|
return ((OSTRING, ''.join(v)), iend)
|
|
|
|
|
|
def FromHexPair(a, b):
|
|
"""Interpret string a+b as hex pair, and return the pair's value."""
|
|
|
|
try:
|
|
v = int(a + b, 16)
|
|
except TypeError:
|
|
if WARN:
|
|
print('funny hex pair', a + b)
|
|
v = 0
|
|
return chr(v)
|
|
|
|
|
|
def GetPDFArray(s, i):
|
|
"""Convert and return object array starting at s[i]."""
|
|
|
|
j = i + 1
|
|
v = []
|
|
while j < len(s):
|
|
m = _re_pswhitespaceandcomments.match(s, j)
|
|
if m:
|
|
j = m.end()
|
|
if j == len(s):
|
|
break
|
|
if ordat(s, j) == ord(']'):
|
|
return ((OARRAY, v), j + 1)
|
|
(o, j) = GetPDFObject(s, j)
|
|
if o is None:
|
|
break
|
|
v.append(o)
|
|
if WARN:
|
|
print('unterminated array starting at', i)
|
|
return ((OARRAY, v), j)
|
|
|
|
|
|
def GetPDFDict(s, i):
|
|
"""Convert and return object dict starting at s[i]."""
|
|
|
|
j = i + 2
|
|
v = {}
|
|
while j < len(s):
|
|
m = _re_pswhitespaceandcomments.match(s, j)
|
|
if m:
|
|
j = m.end()
|
|
if j == len(s):
|
|
break
|
|
if ordat(s, j) == ord('>') and ordat(s, j + 1) == ord('>'):
|
|
return ((ODICT, v), j + 2)
|
|
(o, j) = GetPDFObject(s, j)
|
|
if o is None:
|
|
break
|
|
if o[0] != ONAME:
|
|
if WARN:
|
|
print('expected name at', i)
|
|
break
|
|
name = o[1]
|
|
(o, j) = GetPDFObject(s, j)
|
|
if o is None:
|
|
break
|
|
v[name] = o
|
|
if WARN:
|
|
print('unterminated dict starting at', i)
|
|
return ((ODICT, v), j)
|
|
|
|
# Crossref dict:
|
|
# Cross references are a way of turning an (object #, generation #) into
|
|
# an actual object in the file, when and indirect reference of the form
|
|
# object# generation# R
|
|
# is found in another object.
|
|
# Cross references are of two types:
|
|
# 1) uncompressed: you find the object at a specified byte offset in the file
|
|
# 2) compressed: you find the object in an object stream which is in turn found
|
|
# by looking for a specified object# with implicit generation 0.
|
|
# We will build a map from (object#, generation#) to a tuple
|
|
# (kind, field2, field3)
|
|
# where if kind==XUNCOMPRESSED then field2 is the file byte offset of the object and field2
|
|
# is its generation #
|
|
# and if kind==XCOMPRESSED then field2 is the object # of the object stream containing it,
|
|
# and field3 is the index of that object within the stream
|
|
XUNCOMPRESSED = 1
|
|
XCOMPRESSED = 2
|
|
|
|
def GetPDFTrailerAndCrossrefs(s):
|
|
"""Find and return the (last) PDF trailer dictionary and cross reference
|
|
dict.
|
|
|
|
Args:
|
|
s: PDF file (as bytes)
|
|
Returns:
|
|
(trailer dict, crossref dict)
|
|
"""
|
|
|
|
startxrefi = s.rfind(b'startxref')
|
|
if startxrefi == -1:
|
|
if WARN:
|
|
print('cannot find startxref')
|
|
return (None, None)
|
|
crossrefi = -1
|
|
m = _re_pseol.match(s, startxrefi + 9)
|
|
if m:
|
|
m2 = _re_psint.match(s, m.end())
|
|
if m2:
|
|
crossrefi = int(m2.group())
|
|
if crossrefi <= 0:
|
|
if WARN:
|
|
print('cannot find crossref index')
|
|
return (None, None)
|
|
crossrefs = {}
|
|
d = None
|
|
last_trailerdict = None
|
|
if s[crossrefi:crossrefi+4] != b'xref':
|
|
# Could be Crossref stream
|
|
(obj, j) = GetPDFObject(s, crossrefi)
|
|
if PDFObjHasType(obj, OINDIRECTDEF):
|
|
strobj = obj[1][2]
|
|
if PDFObjHasType(strobj, OSTREAM):
|
|
strxrefs = GetPDFStreamContents(strobj, s, {}, False)
|
|
if strxrefs is None:
|
|
if WARN:
|
|
print('cannot decode crossref stream')
|
|
return (None, {})
|
|
d = strobj[1][0]
|
|
w = GetTypedValFromDictEntry(d, 'W', OARRAY, s, {})
|
|
ty = GetTypedValFromDictEntry(d, 'Type', ONAME, s, {})
|
|
sz = GetTypedValFromDictEntry(d, 'Size', ONUM, s, {})
|
|
index = GetTypedValFromDictEntry(d, 'Index', OARRAY, s, {})
|
|
prev = GetTypedValFromDictEntry(d, 'Prev', ONUM, s, {})
|
|
if ty != 'XRef' or sz is None or w is None:
|
|
if WARN:
|
|
print('something wrong with XRef stream dictionary')
|
|
return (None, {})
|
|
n1 = w[0][1]
|
|
n2 = w[1][1]
|
|
n3 = w[2][1]
|
|
ntot = n1 + n2 + n3
|
|
firstobjnum = 0
|
|
numobjs = sz
|
|
if index is not None:
|
|
firstobjnum = index[0][1]
|
|
numobjs = index[1][1]
|
|
k = 0
|
|
objnum = firstobjnum
|
|
while k + ntot <= len(strxrefs):
|
|
if n1 == 0:
|
|
f1 = 1
|
|
else:
|
|
(f1, k) = GetPDFMultiByteInt(strxrefs, k, n1)
|
|
(f2, k) = GetPDFMultiByteInt(strxrefs, k, n2)
|
|
if n3 == 0:
|
|
f3 = 0
|
|
else:
|
|
(f3, k) = GetPDFMultiByteInt(strxrefs, k, n3)
|
|
if f1 == 1:
|
|
crossrefs[(objnum, f3)] = (XUNCOMPRESSED, f2, f3)
|
|
elif f1 == 2:
|
|
crossrefs[(objnum, 0)] = (XCOMPRESSED, f2, f3)
|
|
elif f1 != 0:
|
|
if WARN:
|
|
print('unexpected type in XRef:', f1)
|
|
return (None, {})
|
|
objnum += 1
|
|
else:
|
|
if WARN:
|
|
print("no xref and object there is not stream")
|
|
print (obj)
|
|
else:
|
|
if WARN:
|
|
print("no xref and not indirect def")
|
|
print(obj)
|
|
return (d, crossrefs)
|
|
while crossrefi > 0:
|
|
i = crossrefi
|
|
if s[i:i + 4] != b'xref':
|
|
if WARN:
|
|
print('cannot find xref')
|
|
break
|
|
m = _re_pseol.match(s, i + 4)
|
|
if m:
|
|
i = m.end()
|
|
while i < startxrefi:
|
|
# Get start of subsection
|
|
(v, i) = GetPDFTwoInts(s, i)
|
|
if v is None:
|
|
break
|
|
(idstart, nentries) = v
|
|
m = _re_pswhitespaceandcomments.match(s, i)
|
|
if m:
|
|
i = m.end()
|
|
for k in range(idstart, idstart + nentries):
|
|
byteoffset = int(s[i:i + 10])
|
|
gen = int(s[i + 11:i + 16])
|
|
inuse = (ordat(s, i + 17) == ord('n'))
|
|
if inuse:
|
|
crossrefs[(k, gen)] = (XUNCOMPRESSED, byteoffset, gen)
|
|
i += 20
|
|
# Should be at 'trailer' now
|
|
(w, i) = GetPDFKeyword(s, i)
|
|
if w != b'trailer':
|
|
if WARN:
|
|
print('cannot find trailer')
|
|
break
|
|
(trailero, i) = GetPDFObject(s, i)
|
|
if trailero is None or trailero[0] != ODICT:
|
|
if WARN:
|
|
print('cannot find trailer dict')
|
|
break
|
|
trailerdict = trailero[1]
|
|
if last_trailerdict is None:
|
|
last_trailerdict = trailerdict
|
|
if 'Prev' in trailerdict:
|
|
crossrefi = GetTypedValFromDictEntry(trailerdict, 'Prev', ONUM, s, crossrefs)
|
|
if crossrefi is None:
|
|
crossrefi = -1
|
|
else:
|
|
crossrefi = -1
|
|
return (last_trailerdict, crossrefs)
|
|
|
|
def GetPDFMultiByteInt(s, i, fieldlen):
|
|
"""Get a multibyte int from a string of bytes
|
|
|
|
Args:
|
|
s: string of bytes
|
|
i: int, offset in s to start getting the result
|
|
fieldlen: int, how many bytes to get
|
|
Returns:
|
|
int: accumulated multibyte value (high order byte first in s)
|
|
"""
|
|
|
|
ans = 0
|
|
for k in range(i, i + fieldlen):
|
|
ans = ans * 256 + ord(s[k])
|
|
return (ans, i + fieldlen)
|
|
|
|
def ReadPDFPageOneContents(filename):
|
|
"""Read a PDF file and return Content string for its first page.
|
|
|
|
Args:
|
|
filename: name of file
|
|
Returns:
|
|
string: Content string for first page
|
|
"""
|
|
|
|
try:
|
|
f = open(filename, "rb") # binary since some parts may be compressed
|
|
except IOError:
|
|
if WARN:
|
|
print("Can't open file", filename)
|
|
return ''
|
|
contents = f.read()
|
|
f.close()
|
|
return GetPDFPageOneContents(contents)
|
|
|
|
|
|
def GetPDFPageOneContents(s):
|
|
"""Find and return first page in PDF file, given as string.
|
|
|
|
First get the last trailer's dictionary, which should contain
|
|
the Root object, and also the crossref dictionary which gives
|
|
byte offsets for all indirect objects.
|
|
Then from Root object, find Pages object (a page tree), and
|
|
follow leftmost Kid until get to a leaf Page object, which
|
|
in turn has the desired Contents object, which is a stream
|
|
or an array of streams. Decompress (if necessary) the
|
|
stream(s) and return their concatenation.
|
|
|
|
Args:
|
|
s: bytes holding contents of a PDF file
|
|
Returns:
|
|
string: the decoded (possibly decompressed) contents of the first page
|
|
"""
|
|
|
|
(trailerdict, crossrefs) = GetPDFTrailerAndCrossrefs(s)
|
|
if not trailerdict or not crossrefs:
|
|
if WARN:
|
|
print('problem finding trailer or crossrefs')
|
|
return ''
|
|
if 'Root' not in trailerdict:
|
|
if WARN:
|
|
print('cannot find Root object')
|
|
return ''
|
|
root = GetTypedValFromDictEntry(trailerdict, 'Root', ODICT, s, crossrefs)
|
|
if root is None:
|
|
if WARN:
|
|
print('cannot find root dictionary')
|
|
return ''
|
|
pagesdict = GetTypedValFromDictEntry(root, 'Pages', ODICT, s, crossrefs)
|
|
if pagesdict is None:
|
|
if WARN:
|
|
print('cannot find Pages dictionary')
|
|
return ''
|
|
pnode = pagesdict
|
|
while pnode:
|
|
pnodetype = PDFDictType(pnode)
|
|
if pnodetype == 'Pages':
|
|
kidsarray = GetTypedValFromDictEntry(pnode, 'Kids', OARRAY, s,
|
|
crossrefs)
|
|
if not kidsarray:
|
|
if WARN:
|
|
print('cannot find Kids in Pages')
|
|
return ''
|
|
if len(kidsarray) == 0:
|
|
if WARN:
|
|
print('Kids array has no Page')
|
|
return ''
|
|
pnodeobj = GetPDFObjFromIndirectRef(kidsarray[0], s, crossrefs)
|
|
if PDFObjHasType(pnodeobj, ODICT):
|
|
pnode = pnodeobj[1]
|
|
else:
|
|
if WARN:
|
|
print('Kids element has unexpected type')
|
|
return ''
|
|
elif pnodetype == 'Page':
|
|
contentsobj = GetPDFObjFromDictEntry(pnode, 'Contents', s,
|
|
crossrefs)
|
|
if contentsobj is None:
|
|
# it is legal for there to be no contents object:
|
|
# means empty page
|
|
if WARN:
|
|
print('First Page is empty')
|
|
return ''
|
|
if contentsobj[0] == OSTREAM:
|
|
return GetPDFStreamContents(contentsobj, s, crossrefs)
|
|
elif contentsobj[0] == OARRAY:
|
|
pieces = []
|
|
for c in contentsobj[1]:
|
|
if not PDFObjHasType(c, OINDIRECTREF):
|
|
if WARN:
|
|
print('Contents obj child not an indirect ref')
|
|
return ''
|
|
o = GetPDFObjFromIndirectRef(c, s, crossrefs)
|
|
if not PDFObjHasType(o, OSTREAM):
|
|
if WARN:
|
|
print('Contents obj child not a stream')
|
|
return ''
|
|
pieces.append(GetPDFStreamContents(o, s, crossrefs))
|
|
return '\n'.join(pieces)
|
|
else:
|
|
if WARN:
|
|
print('Contents object has unexpected type',
|
|
contentsobj[0])
|
|
return ''
|
|
else:
|
|
if WARN:
|
|
print('Page tree node has unexpected type', pnodetype)
|
|
return ''
|
|
# shouldn't get here
|
|
return ''
|
|
|
|
|
|
def GetPDFObjFromIndirectRef(obj, s, crossrefs):
|
|
"""Return the Object that is referred to by an indirect reference.
|
|
|
|
Args:
|
|
obj: (int, value) - should be (OINDIRECTREF, (obj_number, gen_number))
|
|
s: string - contents of PDF file
|
|
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
|
|
Returns:
|
|
(objectid, value) - the referred value (inside containing OINDIRECTDEF)
|
|
or None if there is any problem
|
|
"""
|
|
|
|
if not PDFObjHasType(obj, OINDIRECTREF):
|
|
return None
|
|
key = obj[1]
|
|
if key not in crossrefs:
|
|
return None
|
|
(f1, f2, f3) = crossrefs[key]
|
|
if f1 == XUNCOMPRESSED:
|
|
if f2 < 0 or f2 >= len(s):
|
|
return None
|
|
(o, _) = GetPDFObject(s, f2)
|
|
elif f1 == XCOMPRESSED:
|
|
o = GetPDFCompressedObject(s, f2, f3, crossrefs)
|
|
return o
|
|
else:
|
|
if WARN:
|
|
print("Bad xref type")
|
|
return None
|
|
if PDFObjHasType(o, OINDIRECTDEF):
|
|
return o[1][2]
|
|
else:
|
|
return None
|
|
|
|
|
|
def GetPDFCompressedObject(s, strnum, oindex, crossrefs):
|
|
"""Get one complete object from compressed stream.
|
|
|
|
Args:
|
|
s : bytes holding contents of a PDF file
|
|
strnum: object number of object stream where object is
|
|
oindex: index of object within the stream
|
|
crossrefs: dict - maps (obj_number, gen_number) to crossref triple
|
|
Returns:
|
|
(objectid, value) - or None, if no such object
|
|
"""
|
|
|
|
strkey = (strnum, 0)
|
|
if strkey not in crossrefs:
|
|
if WARN:
|
|
print("could not find object", strnum, "in crossrefs")
|
|
return None
|
|
(g1, g2, g3) = crossrefs[strkey]
|
|
if g1 != XUNCOMPRESSED:
|
|
if WARN:
|
|
print("stream object is not uncompressed", g1, g2, g3)
|
|
return None
|
|
if g2 < 0 or g2 >= len(s):
|
|
return None
|
|
(ostream, _) = GetPDFObject(s, g2)
|
|
if PDFObjHasType(ostream, OINDIRECTDEF):
|
|
ostream = ostream[1][2]
|
|
if not PDFObjHasType(ostream, OSTREAM):
|
|
if WARN:
|
|
print("stream object does not have type stream")
|
|
return None
|
|
streamcont = GetPDFStreamContents(ostream, s, crossrefs, False)
|
|
d = ostream[1][0]
|
|
ty = GetTypedValFromDictEntry(d, "Type", ONAME, s, crossrefs)
|
|
if ty != "ObjStm":
|
|
if WARN:
|
|
print("stream object does not have Type ObjStm")
|
|
return None
|
|
n = GetTypedValFromDictEntry(d, "N", ONUM, s, crossrefs)
|
|
first = GetTypedValFromDictEntry(d, "First", ONUM, s, crossrefs)
|
|
if not n or not first:
|
|
if WARN:
|
|
print("required n or first not in object stream")
|
|
return None
|
|
i = 0
|
|
ans = None
|
|
for count in range(n):
|
|
(intpair, i) = GetPDFTwoInts(streamcont, i)
|
|
if not intpair:
|
|
if WARN:
|
|
print("stream object did not find int pair at count", count)
|
|
return None
|
|
(id, off) = intpair
|
|
obj = GetPDFObject(streamcont, first + off)
|
|
if count == oindex:
|
|
if obj:
|
|
ans = obj[0]
|
|
break
|
|
return ans
|
|
|
|
|
|
def GetPDFObjFromDictEntry(d, entryname, s, crossrefs):
|
|
"""Return the PDF object that should be at given entry in d.
|
|
|
|
Follow any Indirect refs until get a real object.
|
|
"""
|
|
|
|
if entryname not in d:
|
|
return None
|
|
o = d[entryname]
|
|
if PDFObjHasType(o, OINDIRECTREF):
|
|
return GetPDFObjFromIndirectRef(o, s, crossrefs)
|
|
else:
|
|
return o
|
|
|
|
|
|
def GetTypedValFromDictEntry(d, entryname, ty, s, crossrefs):
|
|
"""Return the value that should be found by entry in d and have type ty."""
|
|
|
|
o = GetPDFObjFromDictEntry(d, entryname, s, crossrefs)
|
|
if PDFObjHasType(o, ty):
|
|
return o[1]
|
|
else:
|
|
return None
|
|
|
|
|
|
def PDFObjHasType(o, ty):
|
|
"""Return True if o, a PDF Object, has type ty."""
|
|
|
|
if o is None:
|
|
return False
|
|
return o[0] == ty
|
|
|
|
|
|
def PDFDictType(d):
|
|
"""Return string value of 'Type' entry in d, '' if not there."""
|
|
|
|
if 'Type' in d:
|
|
tyobj = d['Type']
|
|
if PDFObjHasType(tyobj, ONAME):
|
|
return tyobj[1]
|
|
return ''
|
|
|
|
|
|
def GetPDFStreamContents(contentsobj, s, crossrefs, dodecode=True):
|
|
"""Return the contents of a stream object, applying any needed filters.
|
|
|
|
For now, only handle FlateDecode filter, and with no DecodeParms.
|
|
|
|
Args:
|
|
contentsobj: (OSTREAM, (dict, istart, iend))
|
|
s: bytes - PDF file contents
|
|
crossrefs: dict - maps (obj_number, gen_number) to byte offset in s
|
|
dodecode: bool - should we decode too?
|
|
Returns:
|
|
string - the contents (if dodecode, decoded using latin1 decoder)
|
|
"""
|
|
|
|
if not PDFObjHasType(contentsobj, OSTREAM):
|
|
return None
|
|
(d, istart, _) = contentsobj[1]
|
|
length = GetTypedValFromDictEntry(d, 'Length', ONUM, s, crossrefs)
|
|
if length is None:
|
|
return ''
|
|
ans = s[istart:istart + length]
|
|
filterobj = GetPDFObjFromDictEntry(d, 'Filter', s, crossrefs)
|
|
if filterobj is None:
|
|
return ans.decode()
|
|
filters = []
|
|
if PDFObjHasType(filterobj, ONAME):
|
|
filters = [filterobj[1]]
|
|
elif PDFObjHasType(filterobj, OARRAY):
|
|
for o in filterobj[1]:
|
|
if PDFObjHasType(o, ONAME):
|
|
filters.append(o[1])
|
|
for fname in filters:
|
|
if fname == 'FlateDecode':
|
|
if not zlib:
|
|
raise RuntimeError("pdf decoding requires missing zlib module")
|
|
parms = GetTypedValFromDictEntry(d, 'DecodeParms', ODICT, s, crossrefs)
|
|
needPngPredictor = False
|
|
columns = 1
|
|
if parms is not None:
|
|
predictor = GetTypedValFromDictEntry(parms, 'Predictor', ONUM, s, crossrefs)
|
|
columns = GetTypedValFromDictEntry(parms, 'Columns', ONUM, s, crossrefs)
|
|
if predictor is not None:
|
|
if predictor == 1:
|
|
pass
|
|
elif predictor < 10:
|
|
if WARN:
|
|
print('unhandled predictor type', predictor)
|
|
else:
|
|
if columns is None:
|
|
columns = 1
|
|
needPngPredictor = True
|
|
ans = zlib.decompress(ans)
|
|
if needPngPredictor:
|
|
ansbytes = []
|
|
col1 = columns + 1
|
|
k = 0
|
|
currow = [0] * columns
|
|
while k + col1 <= len(ans):
|
|
if ans[k] != 2:
|
|
if WARN:
|
|
print('unhandled PNG predictor type: ', ans[k])
|
|
k += 1
|
|
for j in range(0, columns):
|
|
currow[j] = (currow[j] + ans[k + j]) & 0xFF
|
|
ansbytes.append(chr(currow[j]))
|
|
k += columns
|
|
if k != len(ans):
|
|
if WARN:
|
|
print("FlateDecode with prediction didn't consume all bytes")
|
|
ans = ''.join(ansbytes)
|
|
if dodecode:
|
|
ans = ans.decode(encoding='latin1', errors='ignore')
|
|
else:
|
|
if WARN:
|
|
print('unhandled stream filter', fname)
|
|
return ''
|
|
return ans
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if len(sys.argv) == 2:
|
|
page1contents = ReadPDFPageOneContents(sys.argv[1])
|
|
sys.stdout.write(page1contents)
|