pdfminer的TextConverter得到文件字符无空格解决方法

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import threading, os

class PdfThread(threading.Thread):
    """Worker thread that converts queued PDF files to plain-text files.

    Each item taken from ``in_queue`` is the path of a PDF file. The text
    extracted by pdfminer is written to ``<doc_txt_dir>/<pdf base name>.txt``.
    The thread loops forever; ``task_done()`` is called exactly once for every
    item obtained from the queue, whether or not the conversion succeeded.
    """

    def __init__(self, in_queue, doc_txt_dir):
        """Store the work queue and the output directory.

        in_queue    -- queue-like object providing ``get()`` and ``task_done()``.
        doc_txt_dir -- directory in which the ``.txt`` files are created.
        """
        threading.Thread.__init__(self)
        self.in_queue = in_queue
        self.doc_txt_dir = doc_txt_dir

    def _txt_path_for(self, in_fname):
        """Return the output ``.txt`` path for the PDF path ``in_fname``."""
        # splitext() leaves a name without any extension untouched, whereas
        # slicing at rfind(".") == -1 would silently drop its last character.
        stem = os.path.splitext(os.path.basename(in_fname))[0]
        return os.path.join(self.doc_txt_dir, stem + ".txt")

    def _convert(self, in_fname, out_file):
        """Extract the text of ``in_fname`` into ``out_file`` as UTF-8."""
        rsrc = PDFResourceManager(caching=True)
        # Passing LAParams keeps the spaces between characters that are
        # present in the original PDF.
        laparams = LAParams()
        # The input is opened first so that a missing or unreadable PDF does
        # not leave an empty output file behind.
        with open(in_fname, 'rb') as fp:
            with open(out_file, 'w') as outfp:
                device = TextConverter(rsrc, outfp, codec='utf-8',
                                       laparams=laparams)
                try:
                    process_pdf(rsrc, device, fp, set(), maxpages=0,
                                password='', caching=True,
                                check_extractable=True)
                finally:
                    # Close the converter even when parsing fails.
                    device.close()

    def run(self):
        """Convert queued PDF files one after another, forever."""
        import traceback

        while True:
            # get() stays outside the try block so that task_done() is only
            # ever paired with an item that was actually taken from the queue.
            in_fname = self.in_queue.get()
            try:
                out_file = self._txt_path_for(in_fname)
                self._convert(in_fname, out_file)
                print("have convert pdf file %s to file %s" % (in_fname, out_file))
            except Exception:
                # A single broken PDF must not kill the worker: report it and
                # carry on with the next queued file.
                print("failed to convert pdf file %s" % (in_fname,))
                traceback.print_exc()
            finally:
                self.in_queue.task_done()
#TagExtractor

 

posted @ 2013-09-30 16:48  springbarley  阅读(1252)  评论(0)  编辑  收藏  举报