diction

程序人生
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理
using System;
using System.IO;
using iTextSharp.text.pdf;

namespace PdfToText
{
    
/// <summary>
    
/// Parses a PDF file and extracts the text from it.
    
/// </summary>

    public class PDFParser 
    
{
        
/// BT = Beginning of a text object operator 
        
/// ET = End of a text object operator
        
/// Td move to the start of next line
        
///  5 Ts = superscript
        
/// -5 Ts = subscript


        
#region Fields

        
#region _numberOfCharsToKeep
        
/// <summary>
        
/// The number of characters to keep, when extracting text.
        
/// </summary>

        private static int _numberOfCharsToKeep = 15;
        
#endregion


        
#endregion


        
#region ExtractText
        
/// <summary>
        
/// Extracts a text from a PDF file.
        
/// </summary>
        
/// <param name="inFileName">the full path to the pdf file.</param>
        
/// <param name="outFileName">the output file name.</param>
        
/// <returns>the extracted text</returns>

        public bool ExtractText(string inFileName, string outFileName)
        
{
            StreamWriter outFile 
= null;
            
try
            
{
                
// Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                
//outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
                
                Console.Write(
"Processing: ");
                
                
int     totalLen    = 68;
                
float   charUnit    = ((float)totalLen) / (float)reader.NumberOfPages;
                
int     totalWritten= 0;
                
float   curUnit     = 0;

                
for (int page = 1; page <= reader.NumberOfPages; page++)
                
{                    
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) 
+ " ");
                    
                    
// Write the progress.
                    if (charUnit >= 1.0f)
                    
{
                        
for (int i = 0; i < (int)charUnit; i++)
                        
{
                            Console.Write(
"#");
                            totalWritten
++;
                        }

                    }

                    
else
                    
{
                        curUnit 
+= charUnit;
                        
if (curUnit >= 1.0f)
                        
{
                            
for (int i = 0; i < (int)curUnit; i++)
                            
{
                                Console.Write(
"#");
                                totalWritten
++;
                            }

                            curUnit 
= 0;
                        }

                        
                    }

                }


                
if (totalWritten < totalLen)
                
{
                    
for (int i = 0; i < (totalLen - totalWritten); i++)
                    
{
                        Console.Write(
"#");
                    }

                }

                
return true;
            }

            
catch
            
{
                
return false;
            }

            
finally
            
{
                
if (outFile != null) outFile.Close();
            }

        }

        
#endregion


        
#region ExtractTextFromPDFBytes
        
/// <summary>
        
/// This method processes an uncompressed Adobe (text) object 
        
/// and extracts text.
        
/// </summary>
        
/// <param name="input">uncompressed</param>
        
/// <returns></returns>

        private string ExtractTextFromPDFBytes(byte[] input)
        
{
            
if (input == null || input.Length == 0return "";

            
try
            
{
                
string resultString = "";

                
// Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                
// Flag showing if the next character is literal 
                
// e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                
// () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                
// Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                
for (int i = 0; i < input.Length; i++)
                
{
                    
char c = (char)input[i];

                    
if (inTextObject)
                    
{
                        
// Position the text
                        if (bracketDepth == 0)
                        
{
                            
if (CheckToken(new string[] "TD""Td" }, previousCharacters))
                            
{
                                resultString 
+= "\n\r";
                            }

                            
else
                            
{
                                
if (CheckToken(new string[] {"'""T*""\""}, previousCharacters))
                                {
                                    resultString 
+= "\n";
                                }

                                
else
                                
{
                                    
if (CheckToken(new string[] "Tj" }, previousCharacters))
                                    
{
                                        resultString 
+= " ";
                                    }

                                }

                            }

                        }


                        
// End of a text object, also go to a new line.
                        if (bracketDepth == 0 && 
                            CheckToken( 
new string[]{"ET"}, previousCharacters))
                        
{

                            inTextObject 
= false;
                            resultString 
+= " ";
                        }

                        
else
                        
{
                            
// Start outputting text
                            if ((c == '('&& (bracketDepth == 0&& (!nextLiteral))
                            
{
                                bracketDepth 
= 1;
                            }

                            
else
                            
{
                                
// Stop outputting text
                                if ((c == ')'&& (bracketDepth == 1&& (!nextLiteral))
                                
{
                                    bracketDepth 
= 0;
                                }

                                
else
                                
{
                                    
// Just a normal text character:
                                    if (bracketDepth == 1)
                                    
{
                                        
// Only print out next character no matter what. 
                                        
// Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        
{
                                            nextLiteral 
= true;
                                        }

                                        
else
                                        
{
                                            
if (((c >= ' '&& (c <= '~')) ||
                                                ((c 
>= 128&& (c < 255)))
                                            
{
                                                resultString 
+= c.ToString();
                                            }


                                            nextLiteral 
= false;
                                        }

                                    }

                                }

                            }

                        }

                    }


                    
// Store the recent characters for 
                    
// when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    
{
                        previousCharacters[j] 
= previousCharacters[j + 1];
                    }

                    previousCharacters[_numberOfCharsToKeep 
- 1= c;

                    
// Start of a text object
                    if (!inTextObject && CheckToken(new string[]{"BT"}, previousCharacters))
                    
{
                        inTextObject 
= true;
                    }

                }

                
return resultString;
            }

            
catch
            
{
                
return "";
            }

        }

        
#endregion


        
CheckToken
    }

}


usage:
using System;
using System.Text;
using System.IO;

namespace PdfToText
{
    
/// <summary>
    
/// The main entry point to the program.
    
/// </summary>

    class Program
    
{
        
static void Main(string[] args)
        
{
            
try
            
{
                
if (args.Length < 1)
                
{
                    DisplayUsage();
                    
return;
                }


                
string file = args[0];
                
if (!File.Exists(file))
                
{
                    file 
= Path.GetFullPath(file);
                    
if (!File.Exists(file))
                    
{
                        Console.WriteLine(
"Please give in the path to the PDF file.");
                    }

                }


                PDFParser pdfParser 
= new PDFParser();
                pdfParser.ExtractText(file, Path.GetFileNameWithoutExtension(file)
+".txt");
            }

            
catch (Exception exc)
            
{
                Console.WriteLine(exc);
            }

        }


        
static void DisplayUsage()
        
{
            Console.WriteLine();
            Console.WriteLine(
"Usage:\tpdftotext FILE");
            Console.WriteLine();
            Console.WriteLine(
"\tFILE\t the path to the PDF file, it may be relative or absolute.");
            Console.WriteLine();
        }

    }

}


问题:不支持中文,没有布局,仅仅是把每页的所有文字抽取出来。如果想真正实现 PDFtoTxt,仍然有很多路要走,但毕竟是个好的开始。

from http://www.codeproject.com/useritems/PDFToText.asp
posted on 2006-06-16 07:26 RubyPDF 阅读(3302) 评论(8)  编辑 收藏 所属分类: iTextSharp(iText#)

  回复  引用  查看    
2006-06-16 08:14 | 自適應軟件......
暈,這是個開源的,早就Release出來了.不過,有點遺憾的是,WritePDF的時候,換行的時候,不能判斷行首位字母的時候,自動換行!

另外,把文字從PDF抽出來的時候,怎麼不支持中文呢?不解.還有,既然你已經把文字抽出來了,為甚麼不能實現PDFtoTxt呢,你直接寫到Txt,或者Word不就行了?
  回复  引用  查看    
2006-06-16 09:17 | HardRock
这个不是软件,只是一段应用代码,而且也不是我写的,至于为什么不支持中文,这个就不想多说了,至少目前我还没有能力实现它。
关于你说的首位字母的问题,这个有人已经实现,但不愿意公布方法,我也没有研究过,一是能力问题,另外一个是因为写PDF不是我的研究重点。
PDFToText涉及很多问题的,有兴趣你可以看看XPDF的一个工具就比较清楚了。
  回复  引用  查看    
2006-07-20 11:46 | zwg51666 [未注册用户]
谢谢HardRock ,那多页tif怎么处理,我现在每次只能得到第一页,谢谢
  回复  引用  查看    
2006-07-20 11:55 | HardRock
@zwg51666
你在搞什么?再把一个问题发几遍或者到处乱发,我删除了!
  回复  引用  查看    
2006-07-20 12:01 | zwg51666 [未注册用户]
你好,你看下我的代码,为什么,他还是裁图了,我想得到缩放的效果,
if (tif1.ScaledWidth>760)
{
float tempin=(760/tif1.ScaledWidth);
tempin*=100;
this.textBox1.Text=tempin.ToString();
tif1.ScalePercent(tempin);

}
谢谢
  回复  引用  查看    
2006-07-20 18:13 | zwg51666 [未注册用户]
我的问题解决了,谢
  回复  引用  查看    
2007-03-07 16:40 | minghong [未注册用户]
用xpdf(http://www.foolabs.com/xpdf/about.html)中的pdftotext,加上enc就可以抽出中文了,例如
pdftotext -layout -enc UTF-8 test.pdf
  回复  引用  查看    
2007-10-11 05:06 | Ottoniel [未注册用户]
Esta bien, pero no entiendo nada :D