1 import os
2 from pdfminer.pdfparser import PDFParser
3 from pdfminer.pdfdocument import PDFDocument
4 from pdfminer.pdfpage import PDFPage
5 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
6 from pdfminer.pdfinterp import PDFResourceManager
7 from pdfminer.pdfinterp import PDFPageInterpreter
8 from pdfminer.pdfdevice import PDFDevice
9 from pdfminer.layout import *
10 from pdfminer.converter import PDFPageAggregator
11
12
13 import os
14 import pdb
15
16 #inputFile = r'D:\用户目录\桌面\340xxxxxxxxxxxxxxxxxx0.pdf'
17
18
def decode_text(s):
    """Decode a PDFDocEncoding string to Unicode.

    Python 3 compatible variant of pdfminer's helper.

    Args:
        s: The raw string to decode, as ``bytes`` or ``str``.

    Returns:
        The decoded text as ``str``. Input that starts with the UTF-16BE
        byte order mark (``b'\\xfe\\xff'``) is decoded as UTF-16BE with
        undecodable data ignored; anything else is mapped element by
        element through pdfminer's ``PDFDocEncoding`` table.
    """
    if isinstance(s, bytes) and s.startswith(b'\xfe\xff'):
        # Drop the two-byte BOM and decode the remainder; the builtin
        # ``str`` replaces the former ``six.text_type`` (``six`` was never
        # imported, so this branch used to raise NameError).
        return str(s[2:], 'utf-16be', 'ignore')

    # Imported here because the module's imports do not bring the table
    # into scope, and so the UTF-16 path above works without it.
    from pdfminer.utils import PDFDocEncoding

    # Iterating ``bytes`` yields ints while iterating ``str`` yields
    # one-character strings; normalise both to integer code points.
    ords = (ord(c) if isinstance(c, str) else c for c in s)
    return ''.join(PDFDocEncoding[o] for o in ords)
29
30
31
def get_msgs(inputFile):
    """Collect the text of every horizontal text box in a PDF file.

    Args:
        inputFile: Path of the PDF file to read.

    Returns:
        A list of strings, one per ``LTTextBoxHorizontal`` layout element,
        in the order pages and their layout elements are yielded, each with
        leading and trailing whitespace stripped.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    msgs = []
    # Keep the file open for the whole extraction (pages are interpreted
    # from this stream inside the loop below) and let ``with`` close it
    # even when parsing raises.
    with open(inputFile, 'rb') as fp:
        # Parser that reads the PDF from the file stream.
        parser = PDFParser(fp)
        # Document object that stores the PDF structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparams = LAParams()
        # Aggregating device that exposes each page's layout via get_result().
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that renders page contents onto the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Layout object for the page that was just processed.
            layout = device.get_result()
            msgs.extend(
                element.get_text().strip()
                for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )

    return msgs
69
70
71
72 #print(msgs[5][5:]+ '\t' + msgs[4][4:])
73
74
75
if __name__ == "__main__":
    # Batch mode: for every PDF in the current directory, append one
    # tab-separated line built from two of its text boxes to the output file.
    import sys

    for name in os.listdir('.'):
        # Compare the extension case-insensitively so ``.PDF`` files are
        # picked up as well.
        if os.path.splitext(name)[-1].lower() != '.pdf':
            continue

        msg = get_msgs(name)

        # The line below indexes text boxes 4 and 5; a PDF with fewer boxes
        # would raise IndexError and abort the batch after earlier rows were
        # already appended, so report it and move on instead.
        if len(msg) < 6:
            print(
                'skipped {}: expected at least 6 text boxes, found {}'.format(
                    name, len(msg)
                ),
                file=sys.stderr,
            )
            continue

        # NOTE(review): the slices presumably drop fixed-length field labels
        # from the 6th and 5th text boxes -- confirm against the PDF template.
        ms = msg[5][5:] + '\t' + msg[4][4:]
        with open('学生信息表.txt', 'a') as f:
            f.write(ms + '\n')
86