1 #! python2
2 # coding: utf-8
3
4 import sys
5 from cStringIO import StringIO
6 from pdfminer import pdfinterp
7 from pdfminer import pdfpage
8 from pdfminer import converter
9 from pdfminer import layout
10
# Extract the text of the PDF at `path` into `data`.
# NOTE(review): `path` is not bound anywhere in the visible part of this
# file; it must be set to the PDF's location before this point is reached.
# open() is used rather than the file() constructor, which the Python 2
# documentation advises against calling directly to open files.
with open(path, 'rb') as fp:
    # Resource manager handed to both the converter and the interpreter.
    rsrcmgr = pdfinterp.PDFResourceManager()
    # Buffer passed to the converter; its contents are read back into
    # `data` below.
    retstr = StringIO()
    codec = 'utf-8'
    laparams = layout.LAParams()
    device = converter.TextConverter(
        rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        pages = pdfpage.PDFPage.get_pages(fp)
        for page in pages:
            interpreter.process_page(page)
    finally:
        # Close the converter even when a page fails to process, so the
        # device is never left open on the error path.
        device.close()
    # `retstr` is deliberately left open so later code can still read it.
    data = retstr.getvalue()