pptx 文件的文本内容

2018 年写的，有许多不足之处，仅供参考。
office 文件格式说明，参见：https://learn.microsoft.com/en-us/openspecs/office_file_formats/MS-OFFFFLP/8aea05e3-8c1e-4a9a-9614-31f71e679456

点击查看代码

unit u_WordFile;

interface

uses
  u_CompoundBinaryFile, u_DirectoryEntry, Classes, u_Helper, Windows, u_Pcd,
  SysUtils, StrUtils;

const
  //存储于FIB中的word文档标识
  WORD_IDENT = $A5EC;

type
  {Word文件 TODO 或许需要处理文档语言，与编码的情况}
  TWordFile = class(TCompoundBinaryFile)
  private
    //Word格式的版本
    FNFib: Word;
    //是否使用1Table
    FUsed1Table: Boolean;
    //标识文档使用的语言
    FLidFE: Word;
    //文档中正文的总字数
    FCcpText: Integer;
    //文档中页脚的总字数
    FCcpFtn: Integer;
    //文档中页眉的总字数
    FCcpHdd: Integer;
    //文档中批注的总字数
    FCcpAtn: Integer;
    //文档中尾注的总字数
    FCcpEdn: Integer;
    //文档中文本框的总字数
    FCcpTxbx: Integer;
    //文档中页眉文本框的总字数
    FCcpHdrTxbx: Integer;

    //Clx在Table Stream中的偏移
    FFcClx: DWORD;
    //Clx的大小
    FLcbClx: DWORD;

    //该列表存储Pcd结构关联的文本的长度
    FTextLenList: TList;
    //Pcd结构列表
    FPcdList: TList;
    //文本内容
    FTextContent: WideString;
    FParagraphText: WideString;
    FFootnoteText: WideString;
    FHeaderText: WideString;
    FCommentText: WideString;
    FEndnoteText: WideString;
    FTextboxText: WideString;
    FHeaderTextboxText: WideString;
    
    procedure ReadWordDocument();
    procedure ReadFibBase(Stream: TStream);
    procedure ReadFibRgW97(Stream: TStream);
    procedure ReadFibRgLw97(Stream: TStream);
    procedure ReadFibRgFcLcb(Stream: TStream);
    procedure ReadTableStream();
    procedure ReadText();
    procedure SplitText();
    procedure ClearPcdList();
  protected
    procedure ReadContent(); override;
  public
    destructor Destroy(); override;
    function GetTextContent(): WideString; override;
    function FindText(Text: WideString): Boolean; override;
  end;

implementation

{ TWordFile }

destructor TWordFile.Destroy();
begin
  if FTextLenList <> nil then
    FreeAndNil(FTextLenList);
  if FPcdList <> nil then
  begin
    ClearPcdList();
    FreeAndNil(FPcdList);
  end;
  inherited;
end;

procedure TWordFile.ReadContent;
begin
  inherited;
  ReadWordDocument();
end;

procedure TWordFile.ReadWordDocument;
var
  entry: TDirectoryEntry;
begin
  entry := FRootDirEntry.FindChild('WordDocument');
  ReadDirEntryData(entry);
  entry.DataStream.Seek(0, soFromBeginning);
  ReadFibBase(entry.DataStream);
  ReadFibRgW97(entry.DataStream);
  ReadFibRgLw97(entry.DataStream);
  ReadFibRgFcLcb(entry.DataStream);
  ReadTableStream();
  ReadText();
  FilterSpecialChar(FTextContent);
  //SplitText();
end;

{读取FibBase结构的数据}
procedure TWordFile.ReadFibBase(Stream: TStream);
var
  wordIdent: Word;
  flags: Word;
  encrypted: Boolean;
begin
  Stream.ReadBuffer(wordIdent, 2);
  if wordIdent <> WORD_IDENT then
  begin
    raise CompoundFileException.Create(FFileName + ' 不是doc文件.');
  end;
  
  Stream.ReadBuffer(FNFib, 2);

  Stream.Seek(6, soFromCurrent);
  Stream.ReadBuffer(flags, 2);
  encrypted := IsFlagSet(flags, 9);
  if encrypted then
    raise CompoundFileException.Create(FFileName + ' 已被加密.');
  FUsed1Table := IsFlagSet(flags, 10);

  Stream.Seek(20, soFromCurrent);
end;

{读取FibRgW97结构的数据}
procedure TWordFile.ReadFibRgW97(Stream: TStream);
var
  //FibRgW97所包含的Word字段的个数
  csw: Word;
begin
  Stream.ReadBuffer(csw, 2);

  Stream.Seek(26, soFromCurrent);
  Stream.ReadBuffer(FLidFE, 2);
  if csw > 14 then
  begin
    Stream.Seek((csw - 14) * 2, soFromCurrent);
  end;
end;

{读取FibRgLw97结构的数据}
procedure TWordFile.ReadFibRgLw97(Stream: TStream);
var
  //FibRgLw97所包含的DWORD字段的个数
  cslw: Word;
begin
  Stream.ReadBuffer(cslw, 2);
  Stream.Seek(12, soFromCurrent);
  Stream.ReadBuffer(FCcpText, 4);
  Stream.ReadBuffer(FCcpFtn, 4);
  Stream.ReadBuffer(FCcpHdd, 4);
  Stream.Seek(4, soFromCurrent);
  Stream.ReadBuffer(FCcpAtn, 4);
  Stream.ReadBuffer(FCcpEdn, 4);
  Stream.ReadBuffer(FCcpTxbx, 4);
  Stream.ReadBuffer(FCcpHdrTxbx, 4);

  Stream.Seek(cslw * 4 - 44, soFromCurrent);
end;

{读取FibRgFcLcb结构的数据}
procedure TWordFile.ReadFibRgFcLcb(Stream: TStream);
var
  //FibRgFcLcb所包含的无符号64位字段的个数
  cbRgFcLcb: Word;
begin
  Stream.ReadBuffer(cbRgFcLcb, 2);
  Stream.Seek(66 * 4, soFromCurrent);
  Stream.ReadBuffer(FFcClx, 4);
  Stream.ReadBuffer(FLcbClx, 4);
end;

{读取Table中的数据}
procedure TWordFile.ReadTableStream();
var
  entry: TDirectoryEntry;
  stream: TStream;
  //clx结构位于数据流的位置
  clxOffset: Int64;
  //clx结构在数据流的结束位置
  clxEnd: Int64;
  //标识RgPrc结构或Pcdt结构 
  clxt: Byte;
  //Prc结构包含的PrcData结构下的GrpPrl结构的长度 
  cbGrpprl : SmallInt;
  //PlcPcd结构体列表的大小
  lcb: DWORD;
  pcdCount: DWORD;
  i: DWORD;
  textStartPoint: DWORD;
  textEndPoint: DWORD;
  info: Word;
  fc: DWORD;
  prm: Word;
  pcd: TPcd;  
begin
  if FUsed1Table then
    entry := FRootDirEntry.FindChild('1Table')
  else
    entry := FRootDirEntry.FindChild('0Table');
  ReadDirEntryData(entry);
  stream := entry.DataStream;

  clxOffset := FFcClx;
  clxEnd := clxOffset + FLcbClx;
  stream.Seek(clxOffset, soFromBeginning);

  stream.ReadBuffer(clxt, 1);
  cbGrpprl := 0;
  //跳过Prc结构
  while (clxt = 1) and (stream.Position < clxEnd) do
  begin
    stream.ReadBuffer(cbGrpprl , 2);
    stream.Seek(cbGrpprl , soFromCurrent);
    stream.ReadBuffer(clxt, 1);
  end;

  stream.ReadBuffer(lcb, 4);
  pcdCount := (lcb - 4) div 12;

  FTextLenList := TList.Create;
  for i:=1 to pcdCount do
  begin
    stream.ReadBuffer(textStartPoint, 4);
    stream.ReadBuffer(textEndPoint, 4);
    FTextLenList.Add(Pointer(textEndPoint - textStartPoint));
    stream.Seek(-4, soFromCurrent);
  end;
  stream.Seek(4, soFromCurrent);
  FPcdList := TList.Create;
  for i:=1 to pcdCount do
  begin
    stream.ReadBuffer(info, 2);
    stream.ReadBuffer(fc, 4);
    stream.ReadBuffer(prm, 2);
    pcd := TPcd.Create(info, fc, prm);
    FPcdList.Add(pcd);
  end;
end;

procedure TWordFile.ReadText();
var
  entry: TDirectoryEntry;
  stream: TStream;

  textLen: DWORD;
  i, j: DWORD;
  pcd: TPcd;
  len: DWORD;
  wideStr: WideString;
  ansiStr: AnsiString;
begin
  entry := FRootDirEntry.FindChild('WordDocument');
  stream := entry.DataStream;
  stream.Seek(0, soFromBeginning);

  textLen := 0;
  j := FPcdList.Count - 1;
  for i:=0 to j do
  begin
    pcd := TPcd(FPcdList.Items[i]);
    stream.Seek(pcd.TextLocation, soFromBeginning);
    len := DWORD(FTextLenList.Items[i]);
    if pcd.Unicode then
    begin
      SetLength(wideStr, len);
      stream.ReadBuffer(wideStr[1], len * 2);
    end
    else
    begin
      SetLength(ansiStr, len);
      stream.ReadBuffer(ansiStr[1], len);
      wideStr := ansiStr;
    end;
    SetLength(FTextContent, textLen + len);
    Move(wideStr[1], FTextContent[textLen + 1], len * 2);
    Inc(textLen, len);
  end;
end;

procedure TWordFile.SplitText();
var
  paragraphEnd,
  footnoteEnd,
  headerEnd,
  commentEnd,
  endnoteEnd,
  textboxEnd: Integer;
begin
  paragraphEnd := FCcpText;
  footnoteEnd := paragraphEnd + FCcpFtn;
  headerEnd := footnoteEnd + FCcpHdd;
  commentEnd := headerEnd + FCcpAtn;
  endnoteEnd := commentEnd + FCcpEdn;
  textboxEnd := endnoteEnd + FCcpTxbx;

  FParagraphText := MidStr(FTextContent, 0, paragraphEnd);
  FilterSpecialChar(FParagraphText);

  FFootnoteText := MidStr(FTextContent, paragraphEnd, FCcpFtn);
  FilterSpecialChar(FFootnoteText);

  FHeaderText := MidStr(FTextContent, footnoteEnd, FCcpHdd);
  FilterSpecialChar(FHeaderText);

  FCommentText := MidStr(FTextContent, headerEnd, FCcpAtn);
  FilterSpecialChar(FCommentText);

  FEndnoteText := MidStr(FTextContent, commentEnd, FCcpEdn);
  FilterSpecialChar(FEndnoteText);

  FTextboxText := MidStr(FTextContent, endnoteEnd, FCcpTxbx);
  FilterSpecialChar(FTextboxText);

  FHeaderTextboxText := MidStr(FTextContent, textboxEnd, FCcpHdrTxbx);
  FilterSpecialChar(FHeaderTextboxText);
end;

procedure TWordFile.ClearPcdList;
var
  i, j: Integer;
begin
  j := FPcdList.Count - 1;
  for i:=0 to j do
  begin
    TObject(FPcdList.Items[i]).Free;
  end;
  FPcdList.Clear;
end;

function TWordFile.GetTextContent: WideString;
begin
  Result := FTextContent;
end;

function TWordFile.FindText(Text: WideString): Boolean;
begin
  Result := Pos(Text, FTextContent) > 0;
end;

end.

点击查看代码

unit u_WordOOXmlFile;

interface

uses
  u_OfficeOpenXmlFile, NativeXml, SysUtils;

type
  TWordOOXmlFile = class(TOfficeOpenXmlFile)
  private
    FTextContent: WideString;
    procedure ReadText(Node: TXmlNode);
  protected
    procedure ReadContent(); override;
  public
    function GetTextContent(): WideString; override;
    function FindText(Text: WideString): Boolean; override;
  end;

implementation

{ TWordOOXmlFile }

function TWordOOXmlFile.GetTextContent: WideString;
begin
  Result := FTextContent;
end;

procedure TWordOOXmlFile.ReadContent;
var
  xml: TNativeXml;
begin
  inherited;
  xml := GetXml('document.xml');
  try
    if (xml <> nil) and (xml.Root <> nil) then
    begin
      ReadText(xml.Root);
    end
    else
    begin
      raise OOXmlException.Create(FFileName + ' 文件已损坏.');
    end;
  finally
    if xml <> nil then
      xml.Free;
  end;
end;

procedure TWordOOXmlFile.ReadText(Node: TXmlNode);
var
  i, j: Integer;
  childNode: TXmlNode;
  text: WideString;
  oldLen: Integer;
  space: string;
begin
  j := Node.NodeCount - 1;
  for i:=0 to j do
  begin
    childNode := Node.Nodes[i];
    if SameText(childNode.Name, 'w:t') then
    begin
      text := childNode.ValueAsWidestring;
      oldLen := Length(FTextContent);
      SetLength(FTextContent, oldLen + Length(text));
      Move(text[1], FTextContent[oldLen + 1], Length(text) * 2);
      if childNode.HasAttribute('xml:space') then
      begin
        space := childNode.AttributeByName['xml:space'];
        if space = 'preserve' then
        begin
          SetLength(FTextContent, Length(FTextContent) + 1);
          FTextContent[Length(FTextContent)] := ' ';
        end;
      end;
    end
    else if SameText(childNode.Name, 'w:cr') or SameText(childNode.Name, 'w:br')  then
    begin
      SetLength(FTextContent, Length(FTextContent) + 2);
      FTextContent[Length(FTextContent) - 1] := #13;
      FTextContent[Length(FTextContent)] := #10;
    end
    else if SameText(childNode.Name, 'w:tab') then
    begin
      SetLength(FTextContent, Length(FTextContent) + 1);
      FTextContent[Length(FTextContent)] := #9;
    end
    else if SameText(childNode.Name, 'w:p') then
    begin
      ReadText(childNode);
      SetLength(FTextContent, Length(FTextContent) + 2);
      FTextContent[Length(FTextContent) - 1] := #13;
      FTextContent[Length(FTextContent)] := #10;
    end
    else
    begin
      ReadText(childNode);
    end;
  end;
end;

function TWordOOXmlFile.FindText(Text: WideString): Boolean;
begin
  Result := Pos(Text, FTextContent) > 0;
end;

end.

点击查看代码

unit u_PowerPointFile;

interface

uses
  u_CompoundBinaryFile, Windows, Classes, u_DirectoryEntry, u_Helper;

const
  //标识记录是一个容器记录
  CONTAINER = $F;

type
  //记录类型
  TRecordType = (Unknown = 0, SlideListWithText = $0FF0, OfficeArtRecord = $F00D,
    HeadersFooters = $0FD9, TextCharsAtom = $0FA0, TextBytesAtom = $0FA8,
    CString = $0FBA);

  {PPT 文件}
  TPowerPointFile = class(TCompoundBinaryFile)
  private
    //文本内容
    FTextContent: WideString;

    procedure ReadRecords(Stream: TStream);
  protected
    procedure ReadContent(); override;
  public
    function GetTextContent(): WideString; override;
    function FindText(Text: WideString): Boolean; override;
  end;

implementation

{ TPowerPointFile }

function TPowerPointFile.GetTextContent: WideString;
begin
  Result := FTextContent;
end;

procedure TPowerPointFile.ReadContent;
var
  entry: TDirectoryEntry;
begin
  inherited;
  entry := FRootDirEntry.FindChild('PowerPoint Document');
  ReadDirEntryData(entry);
  entry.DataStream.Seek(0, soFromBeginning);
  ReadRecords(entry.DataStream);
end;

{读取PowerPoint Document数据流中的记录}
procedure TPowerPointFile.ReadRecords(Stream: TStream);

  procedure ReadRecord(ParentRecordType: Word);
  var
    value: Word;
    //record版本
    version: Word;
    //该值的意义取决于recordType
    //instance: Word;
    //记录的类型
    recordType: Word;
    //记录的长度
    recordLen: DWORD;
    recordOffset: Int64;
    wideStr: WideString;
    ansiStr: AnsiString;
    textLen: Integer;
  begin
    Stream.ReadBuffer(value, 2);
    version := value and $F;
    //instance := value and $FFF0;
    Stream.ReadBuffer(recordType, 2);
    Stream.ReadBuffer(recordLen, 4);
    recordOffset := Stream.Position;

    if version = CONTAINER then
    begin
      //该记录是一个容器，递归读取其下的记录
      while Stream.Position < recordOffset + recordLen do
        ReadRecord(recordType);
    end
    else if (ParentRecordType = Ord(OfficeArtRecord)) or
      (ParentRecordType = Ord(SlideListWithText)) or
      (ParentRecordType = Ord(HeadersFooters)) then
    begin
      {OfficeArtRecord/SlideListWithText/HeadersFooters记录的
      TextCharsAtom/CString/TextBytesAtom子记录中记录的数据就是我们所需要的
      ppt文本内容}
      if (recordType = Ord(TextCharsAtom)) or (recordType = Ord(CString)) then
      begin
        SetLength(wideStr, recordLen div 2);
        Stream.ReadBuffer(wideStr[1], recordLen);
        ConverLF(wideStr);
        //一个记录中存储的是一个幻灯片的文本，需要在其后添加换行符用以分隔
        wideStr := wideStr + #13#10;
        textLen := Length(FTextContent);
        SetLength(FTextContent, textLen + Length(wideStr));
        Move(wideStr[1], FTextContent[textLen + 1], Length(wideStr) * 2);
      end
      else if recordType = Ord(TextBytesAtom) then
      begin
        SetLength(ansiStr, recordLen);
        Stream.ReadBuffer(ansiStr[1], recordLen);
        wideStr := ansiStr;
        ConverLF(wideStr);
        //一个记录中存储的是一个幻灯片的文本，需要在其后添加换行符用以分隔
        wideStr := wideStr + #13#10;
        textLen := Length(FTextContent);
        SetLength(FTextContent, textLen + Length(wideStr));
        Move(wideStr[1], FTextContent[textLen + 1], Length(wideStr) * 2);
      end
      else
        Stream.Seek(recordLen, soFromCurrent);
    end
    else
    begin
      Stream.Seek(recordLen, soFromCurrent);
    end;
  end;
begin
  while Stream.Position < Stream.Size do
  begin
    ReadRecord(0);
  end;
end;

function TPowerPointFile.FindText(Text: WideString): Boolean;
begin
  Result := Pos(Text, FTextContent) > 0;
end;

end.

点击查看代码

unit u_PowerPointOOXmlFile;

interface

uses
  u_OfficeOpenXmlFile, Classes, NativeXml, SysUtils;

type
  TPowerPointOOXmlFile = class(TOfficeOpenXmlFile)
  private
    FTextContent: WideString;
    procedure GetSlideFileIdx(var List: TList);
    procedure ReadSlideFile(FileIdx: Integer);
    procedure ReadText(Xml: TNativeXml); overload;
    procedure ReadText(Node: TXmlNode); overload;
  protected
    procedure ReadContent(); override;
  public
    function GetTextContent(): WideString; override;
    function FindText(Text: WideString): Boolean; override;  
  end;

{
a:p 为段落
a:t 为文本内容
a:br 为换行

a:tbl 为一个表格
a:tr 为一行表格
a:tc 为一个单元格
}

implementation

{ TPowerPointOOXmlFile }

function TPowerPointOOXmlFile.GetTextContent: WideString;
begin
  Result := FTextContent;
end;

procedure TPowerPointOOXmlFile.ReadContent;
var
  slideFileIdxList: TList;
  i, j: Integer;
  hasNextSlide: Boolean;
  slideNo: Integer;
  slideName: string;
  slideFileIdx: Integer;
begin
  inherited;
  slideFileIdxList := TList.Create;
  try
    GetSlideFileIdx(slideFileIdxList);
    {从第一个幻灯片文件开始，向后顺序读取文件，解析文本内容}
    hasNextSlide := slideFileIdxList.Count > 0;
    slideNo := 1;
    while hasNextSlide do
    begin
      hasNextSlide := False;
      slideName := 'slide' + IntToStr(slideNo) + '.xml';
      j := slideFileIdxList.Count - 1;
      for i:=0 to j do
      begin
        slideFileIdx := Integer(slideFileIdxList[i]);
        if SameText(FZip.Filename[slideFileIdx], slideName) then
        begin
          ReadSlideFile(slideFileIdx);
          slideFileIdxList.Delete(i);
          hasNextSlide := slideFileIdxList.Count > 0;
          Inc(slideNo);
          Break;
        end;
      end
    end;
  finally
    slideFileIdxList.Free;
  end;
end;

{获取幻灯片文件位于ZIP包中的下标，以列表的形式返回
**列表中的幻灯片文件是无序的}
procedure TPowerPointOOXmlFile.GetSlideFileIdx(var List: TList);
var
  i: Integer;
begin
  i := 0;
  while i < FZip.Count do
  begin
    if Pos('ppt\slides\slide', FZip.FullName[i]) > 0 then
    begin
      List.Add(Pointer(i));
    end;
    Inc(i);
  end;
end;

{读取幻灯片文件中的文本}
procedure TPowerPointOOXmlFile.ReadSlideFile(FileIdx: Integer);
var
  stream: TMemoryStream;
  xml: TNativeXml;
begin
  xml := nil;
  stream := TMemoryStream.Create;
  try
    FZip.UnZipToStreamByIndex(stream, FileIdx);
    xml := TNativeXml.Create;
    xml.LoadFromStream(stream);
    ReadText(xml);
  finally
    if xml <> nil then
      xml.Free;
    if stream <> nil then
      stream.free;
  end;
end;

{解析xml，从中抽取去文本}
procedure TPowerPointOOXmlFile.ReadText(Xml: TNativeXml);
begin
  if (xml <> nil) and (xml.Root <> nil) then
  begin
    ReadText(xml.Root);
  end
  else
  begin
    raise OOXmlException.Create(FFileName + ' 文件已损坏.');
  end;
end;

{解析xml节点，从中抽取去文本}
procedure TPowerPointOOXmlFile.ReadText(Node: TXmlNode);
var
  i, j: Integer;
  childNode: TXmlNode;
  text: WideString;
  oldLen: Integer;
begin
  j := Node.NodeCount - 1;
  for i:=0 to j do
  begin
    childNode := Node.Nodes[i];
    if SameText(childNode.Name, 'a:t') then
    begin
      text := childNode.ValueAsWidestring;
      oldLen := Length(FTextContent);
      SetLength(FTextContent, oldLen + Length(text));
      Move(text[1], FTextContent[oldLen + 1], Length(text) * 2);
    end
    else if SameText(childNode.Name, 'a:br')  then
    begin
      SetLength(FTextContent, Length(FTextContent) + 2);
      FTextContent[Length(FTextContent) - 1] := #13;
      FTextContent[Length(FTextContent)] := #10;
    end
    else if SameText(childNode.Name, 'a:t') then
    begin
      SetLength(FTextContent, Length(FTextContent) + 1);
      FTextContent[Length(FTextContent)] := #9;
    end
    else if SameText(childNode.Name, 'a:p') then
    begin
      ReadText(childNode);
      SetLength(FTextContent, Length(FTextContent) + 2);
      FTextContent[Length(FTextContent) - 1] := #13;
      FTextContent[Length(FTextContent)] := #10;
    end
    else
    begin
      ReadText(childNode);
    end;
  end;
end;

function TPowerPointOOXmlFile.FindText(Text: WideString): Boolean;
begin
  Result := Pos(Text, FTextContent) > 0;
end;

end.

点击查看代码

unit u_CompoundBinaryFile;

interface

uses
  u_IFile, Classes, SysUtils, Windows, u_DirectoryEntry, u_Helper;

const
  //复合文件头大小
  HEADER_SIZE = 512;
  //目录条目大小
  DIR_ENTRY_SIZE = 128;
  //最大的常规扇区编号
  MAX_REG_SECT = $FFFFFFFA;
  //保留的扇区编号
  RESERVE_SECT = $FFFFFFFB;
  //标识FAT中的DIFAT扇区
  DIF_SECT = $FFFFFFFC;
  //标识FAT中的FAT扇区
  FAT_SECT = $FFFFFFFD;
  //标识扇区链结束
  END_OF_CHAIN = $FFFFFFFE;
  //标识扇区未使用
  FREE_SECTOR = $FFFFFFFF;
  //标识目录条目不存在
  NOSTREAM = $FFFFFFFF;

  //未知的目录条目
  UNKNOWN_DIR_ENTRY = 0;
  //存储数据的目录条目
  STORAGE_DIR_ENTRY = 1;
  //存储数据流的目录条目?
  STREAM_DIR_ENTRY = 2;
  //根目录条目
  ROOT_DIR_ENTRY = 5;

type
  {复合文件异常}
  CompoundFileException = class(Exception);

  {复合二进制文件}
  TCompoundBinaryFile = class(TInterfacedObject, IFile)
  private
    procedure ReadHeader();
    procedure ReadFATSectors();
    procedure ReadFirst109FatSectors();
    procedure ReadFATSectorsFromDIFAT();
    procedure ReadDirectory();
    function GetDirectoryEntry(EntryIdx: DWORD; out leftSiblingEntryIdx,
      rightSiblingEntryIdx, childEntryIdx: DWORD): TDirectoryEntry;
    procedure ReadDirectoryEntry(Parent: TDirectoryEntry; EntryIdx: DWORD);
    function GetDirectoryEntryPos(EntryIdx: DWORD): Int64;
    procedure ReadMiniFATSectors();
    procedure ReadMiniSectors();
    function GetDirEntryDataSectorChain(DirEntry: TDirectoryEntry): TList;
  protected
    FFileName: string;
    FFileStream: TFileStream;
    //扇区大小
    FSectorSize: Word;
    //小扇区大小
    FMiniSectorSize: Word;
    //FAT扇区数量
    FFatSectorCount: DWORD;
    //第一个目录条目所在的扇区下标
    FFirstDirSectorNo: DWORD;
    //小数据的截止大小（小于该值），小数据存储于小扇区中
    FMiniStreamCutoffSize: DWORD;
    //第一个Mini-FAT扇区编号
    FFirstMiniFATSectorNo: DWORD;
    //Mini-FAT扇区数量
    FMiniFATCount: DWORD;
    //第一个DIFAT扇区的编号
    FFirstDIFATSectorNo: DWORD;
    //DIFAT扇区数量
    FDIFATCount: DWORD;

    //文件分配表扇区列表
    FFATSectorList: TList;
    //目录信息的扇区链
    FDirSectorChain: TList;
    //根目录条目
    FRootDirEntry: TDirectoryEntry;
    //小文件分配表扇区列表
    FMiniFatSectorList: TList;
    //小扇区列表
    FMiniSectorList: TList;

    function GetNormalSectorOffset(SectorNo: DWORD): Int64;
    function GetNextNormalSectorNo(SectorNo: DWORD): DWORD;
    function GetNextMiniStreamNo(MiniStreamNo: DWORD): DWORD;
    function GetMiniStreamOffset(MiniStreamNo: DWORD): Int64;
    procedure ReadDirEntryData(DirEntry: TDirectoryEntry);
    procedure ReadContent(); virtual; abstract;
  public
    constructor Create(FileName: string);
    destructor Destroy(); override;
    function GetObjectInstance(): TObject;
    function GetTextContent(): WideString; virtual; abstract;
    function FindText(Text: WideString): Boolean; virtual; abstract;
  end;

implementation

{ TCompoundBinaryFile }

constructor TCompoundBinaryFile.Create(FileName: string);
begin
  FFileName := FileName;
  FFileStream := TFileStream.Create(FileName, fmOpenRead
    or fmShareDenyWrite);
  ReadHeader();
  ReadFATSectors();
  ReadDirectory();
  ReadMiniFATSectors();
  ReadMiniSectors();
  ReadContent();
end;

destructor TCompoundBinaryFile.Destroy;
begin
  if FFileStream <> nil then
    FreeAndNil(FFileStream);
  if FFATSectorList <> nil then
    FreeAndNil(FFATSectorList);
  if FDirSectorChain <> nil then
    FreeAndNil(FDirSectorChain);
  if FMiniFatSectorList <> nil then
    FreeAndNil(FMiniFatSectorList);
  if FMiniSectorList <> nil then
    FreeAndNil(FMiniSectorList);
  if FRootDirEntry <> nil then
    FreeAndNil(FRootDirEntry);
  inherited;
end;

{读取复合文件的头部信息}
procedure TCompoundBinaryFile.ReadHeader;
  {判断文件是否为office文件}
  function OfficeFile(): Boolean;
  var
    buf: array of Byte;
  begin
    SetLength(buf, 8);
    FFileStream.ReadBuffer(buf[0], 8);
    Result := (buf[0] = $D0) and
              (buf[1] = $CF) and
              (buf[2] = $11) and
              (buf[3] = $E0) and
              (buf[4] = $A1) and
              (buf[5] = $B1) and
              (buf[6] = $1A) and
              (buf[7] = $E1);
  end;
var
  //字节顺序
  byteOrder: Word;
begin             
  if not OfficeFile() then
    raise CompoundFileException.Create(FFileName + ' 不是office文件');

  FFileStream.Seek(20, soFromCurrent);
  FFileStream.ReadBuffer(byteOrder, 2);
  if byteOrder <> $FFFE then
    raise CompoundFileException.Create(FFileName + ' 不支持该版本的office文件. 原因：无法解析文件字节顺序');

  FFileStream.ReadBuffer(FSectorSize, 2);
  FSectorSize := Power(2, FSectorSize);

  FFileStream.ReadBuffer(FMiniSectorSize, 2);
  FMiniSectorSize := Power(2, FMiniSectorSize);

  FFileStream.Seek(10, soFromCurrent);
  FFileStream.ReadBuffer(FFatSectorCount, 4);
  FFileStream.ReadBuffer(FFirstDirSectorNo, 4);

  FFileStream.Seek(4, soFromCurrent);
  FFileStream.ReadBuffer(FMiniStreamCutoffSize, 4);
  FFileStream.ReadBuffer(FFirstMiniFATSectorNo, 4);
  FFileStream.ReadBuffer(FMiniFATCount, 4);

  FFileStream.ReadBuffer(FFirstDIFATSectorNo, 4);
  FFileStream.ReadBuffer(FDIFATCount, 4);
end;

{读取FAT扇区列表}
procedure TCompoundBinaryFile.ReadFATSectors;
begin
  FFATSectorList := TList.Create;
  if FFatSectorCount > 0 then
  begin
    ReadFirst109FatSectors();
    ReadFATSectorsFromDIFAT();
  end;
end;

{读取header区域的109个FAT}
procedure TCompoundBinaryFile.ReadFirst109FatSectors();
var
  i, j: Integer;
  sectorNo: DWORD;
begin
  j := 109;
  for i:=1 to j do
  begin
    FFileStream.ReadBuffer(sectorNo, 4);
    if sectorNo = FREE_SECTOR then
      Break;
    FFATSectorList.Add(Pointer(sectorNo));
  end;
end;

{从DIFAT扇区读取FAT扇区}
procedure TCompoundBinaryFile.ReadFATSectorsFromDIFAT();
var
  DIFATSectorNo: DWORD;
  offset: Int64;
  i, j: Integer;
  FATSectorNo: DWORD;
begin
  //判断是否存在DIFAT
  if FDIFATCount > 0 then
  begin
    DIFATSectorNo := FFirstDIFATSectorNo;
    //判断DIFAT扇区链是否结束
    while DIFATSectorNo <> END_OF_CHAIN do
    begin
      offset := GetNormalSectorOffset(DIFATSectorNo);
      FFileStream.Seek(offset, soFromBeginning);
      j := 127;
      for i:=1 to j do
      begin
        FFileStream.ReadBuffer(FATSectorNo, 4);
        if FATSectorNo = FREE_SECTOR then
          Exit;
        FFATSectorList.Add(Pointer(FATSectorNo));
      end;
      FFileStream.ReadBuffer(DIFATSectorNo, 4);
    end;
  end;
end;

{读取目录信息}
procedure TCompoundBinaryFile.ReadDirectory();
var
  serctorNo: DWORD;
  leftSiblingEntryIdx: DWORD;
  rightSiblingEntryIdx: DWORD;
  childEntryIdx: DWORD;  
begin
  FDirSectorChain := TList.Create;
  if FFirstDirSectorNo >= RESERVE_SECT then
    Exit;
  serctorNo := FFirstDirSectorNo;
  while serctorNo  <> END_OF_CHAIN do
  begin
    FDirSectorChain.Add(Pointer(serctorNo));
    serctorNo := GetNextNormalSectorNo(serctorNo);
  end;
  FRootDirEntry := GetDirectoryEntry(0, leftSiblingEntryIdx,
    rightSiblingEntryIdx, childEntryIdx);
  ReadDirectoryEntry(FRootDirEntry, childEntryIdx);
end;

{读取MiniFAT列表}
procedure TCompoundBinaryFile.ReadMiniFATSectors();
var
  serctorNo: DWORD;
begin
  FMiniFatSectorList := TList.Create;
  if FMiniFATCount > 0 then
  begin
    serctorNo := FFirstMiniFATSectorNo;
    while serctorNo <> END_OF_CHAIN do
    begin
      FMiniFatSectorList.Add(Pointer(serctorNo));
      serctorNo := GetNextNormalSectorNo(serctorNo);
    end;
  end;
end;

{读取小扇区列表}
procedure TCompoundBinaryFile.ReadMiniSectors();
var
  serctorNo: DWORD;
begin
  FMiniSectorList := TList.Create;
  if FMiniFATCount > 0 then
  begin
    serctorNo := FRootDirEntry.DataSectorNo;                      
    while serctorNo <> END_OF_CHAIN do
    begin
      FMiniSectorList.Add(Pointer(serctorNo));
      serctorNo := GetNextNormalSectorNo(serctorNo);
    end;
  end;
end;


{获取指定扇区编号位于文件中的位置}
function TCompoundBinaryFile.GetNormalSectorOffset(SectorNo: DWORD): Int64;
begin
  Result := HEADER_SIZE + FSectorSize * SectorNo;
end;

{获取指定扇区的扇区链的下一个扇区}
function TCompoundBinaryFile.GetNextNormalSectorNo(SectorNo: DWORD): DWORD;
var
  FATIndex: Integer;
  FATSerctorNo: DWORD;
  FATOffset: Int64;
begin
  {计算出扇区编号所对应的FAT}
  FATIndex := SectorNo div 128;
  FATSerctorNo := DWORD(FFATSectorList.Items[FATIndex]);
  {从FAT中查询出下一个扇区的编号}
  FATOffset := GetNormalSectorOffset(FATSerctorNo);
  FFileStream.Seek(FATOffset + 4 * (SectorNo mod 128), soFromBeginning);
  FFileStream.ReadBuffer(Result, 4);
end;

{获取指定小扇区的扇区链的下一个小扇区}
function TCompoundBinaryFile.GetNextMiniStreamNo(MiniStreamNo: DWORD): DWORD;
var
  sectorNo: DWORD;
begin
  sectorNo := DWORD(FMiniFatSectorList.Items[MiniStreamNo div 128]);
  FFileStream.Seek(GetNormalSectorOffset(sectorNo) + 4 * (MiniStreamNo mod 128),
    soFromBeginning);
  FFileStream.ReadBuffer(Result, 4);
end;

{获取指定小数据位于文件中的位置}
function TCompoundBinaryFile.GetMiniStreamOffset(MiniStreamNo: DWORD): Int64;
var
  sectorNo: DWORD;
begin
  sectorNo := DWORD(FMiniSectorList.Items[MiniStreamNo * FMiniSectorSize div FSectorSize]);
  Result := GetNormalSectorOffset(sectorNo) + MiniStreamNo * FMiniSectorSize mod FSectorSize;
end;

{从数据流中解析出指定下标的目录条目}
function TCompoundBinaryFile.GetDirectoryEntry(EntryIdx: DWORD; out leftSiblingEntryIdx,
  rightSiblingEntryIdx, childEntryIdx: DWORD): TDirectoryEntry;
var
  name: WideString;
  nameLen: Word;
  entryType: Byte;
  dataSectorNo: DWORD;
  dataLen: DWORD;
begin
  Result := nil;
  leftSiblingEntryIdx := NOSTREAM;
  rightSiblingEntryIdx := NOSTREAM;
  childEntryIdx := NOSTREAM;

  FFileStream.Seek(GetDirectoryEntryPos(EntryIdx), soFromBeginning);

  SetLength(name, 32);
  FFileStream.ReadBuffer(name[1], 64);
  FFileStream.ReadBuffer(nameLen, 2);
  //取出来的name是为Unicode编码，nameLen为其字节长度，需要转为字符长度(并减去#0结束字符)
  nameLen := nameLen div 2 - 1;
  SetLength(name, nameLen);

  FFileStream.ReadBuffer(entryType, 1);
  if not (entryType in [UNKNOWN_DIR_ENTRY, STORAGE_DIR_ENTRY, STREAM_DIR_ENTRY,
    ROOT_DIR_ENTRY]) then
    Exit;

  FFileStream.Seek(1, soFromCurrent);
  FFileStream.ReadBuffer(leftSiblingEntryIdx, 4);
  FFileStream.ReadBuffer(rightSiblingEntryIdx, 4);
  FFileStream.ReadBuffer(childEntryIdx, 4);

  FFileStream.Seek(36, soFromCurrent);
  FFileStream.ReadBuffer(dataSectorNo, 4);
  FFileStream.ReadBuffer(dataLen, 4);

  Result := TDirectoryEntry.Create(EntryIdx, name, entryType, dataSectorNo,
    dataLen);
end;

{递归读取给定目录条目的所有子条目}
procedure TCompoundBinaryFile.ReadDirectoryEntry(Parent: TDirectoryEntry; EntryIdx: DWORD);
var
  entry: TDirectoryEntry;
  leftSiblingEntryIdx: DWORD;
  rightSiblingEntryIdx: DWORD;
  childEntryIdx: DWORD;
begin
  entry := GetDirectoryEntry(EntryIdx, leftSiblingEntryIdx,
    rightSiblingEntryIdx, childEntryIdx);
  if entry = nil then
    Exit;
  Parent.AddChild(entry);

  if leftSiblingEntryIdx <> NOSTREAM then
    ReadDirectoryEntry(Parent, leftSiblingEntryIdx);
  if rightSiblingEntryIdx <> NOSTREAM then
    ReadDirectoryEntry(Parent, rightSiblingEntryIdx);
  if childEntryIdx <> NOSTREAM then
    ReadDirectoryEntry(entry, childEntryIdx);
end;

{获取指定下标的目录条目位于数据流的位置}
function TCompoundBinaryFile.GetDirectoryEntryPos(EntryIdx: DWORD): Int64;
var
  idx: Integer;
  serctorNo: DWORD;
begin
  //计算该条目所属的扇区位于扇区链中的位置
  idx := DIR_ENTRY_SIZE * EntryIdx div FSectorSize;
  serctorNo := DWORD(FDirSectorChain.Items[idx]);
  Result := GetNormalSectorOffset(serctorNo) + (DIR_ENTRY_SIZE * EntryIdx mod FSectorSize);
end;

{读取目录条目的数据流}
procedure TCompoundBinaryFile.ReadDirEntryData(DirEntry: TDirectoryEntry);
var
  serctorChain: TList;
  mStream: TMemoryStream;
  i, j: Integer;
  miniStreamNo: DWORD;
  sectorNo: DWORD;
  offset: Int64;
  dataLen: DWORD;
  readLen: DWORD;
  buff: array of Byte;
  exp: Exception;
begin
  serctorChain := nil;
  exp := nil;
  try
    try
      serctorChain := GetDirEntryDataSectorChain(DirEntry);
      mStream := TMemoryStream.Create;
      dataLen := DirEntry.DataLen;
      j := serctorChain.Count - 1;
      if DirEntry.DataLen >= FMiniStreamCutoffSize then
      begin
        SetLength(buff, FSectorSize);
        for i:=0 to j do
        begin
          sectorNo := DWORD(serctorChain.Items[i]);
          offset := GetNormalSectorOffset(sectorNo);
          FFileStream.Seek(offset, soFromBeginning);
          if FSectorSize < dataLen then
            readLen := FSectorSize
          else
            readLen := dataLen;
          Dec(dataLen, readLen);
          FFileStream.ReadBuffer(buff[0], readLen);
          mStream.WriteBuffer(buff[0], readLen);
        end;
      end
      else
      begin
        SetLength(buff, FMiniSectorSize);
        for i:=0 to j do
        begin
          miniStreamNo := DWORD(serctorChain.Items[i]);
          offset := GetMiniStreamOffset(miniStreamNo);
          FFileStream.Seek(offset, soFromBeginning);
          if FMiniSectorSize < dataLen then
            readLen := FMiniSectorSize
          else
            readLen := dataLen;
          Dec(dataLen, readLen);
          FFileStream.ReadBuffer(buff[0], readLen);
          mStream.WriteBuffer(buff[0], readLen);
        end;
      end;
      DirEntry.DataStream := mStream;
    except
      on e: Exception do
      begin
        exp := e;
      end;
    end;
  finally
    if serctorChain <> nil then
      serctorChain.Free;
  end;
  if exp <> nil then
    raise exp;
end;

{如果目录条目关联的数据是位于扇区链中，则返回扇区链，否则返回小数据链}
function TCompoundBinaryFile.GetDirEntryDataSectorChain(DirEntry: TDirectoryEntry): TList;
var
  sectorNo: DWORD;
begin
  Result := TList.Create;
  sectorNo := DirEntry.DataSectorNo;
  if DirEntry.DataLen >= FMiniStreamCutoffSize then
  begin
    while sectorNo <> END_OF_CHAIN do
    begin
      Result.Add(Pointer(sectorNo));
      sectorNo := GetNextNormalSectorNo(sectorNo);
    end;
  end
  else
  begin
    while sectorNo <> END_OF_CHAIN do
    begin
      Result.Add(Pointer(sectorNo));
      sectorNo := GetNextMiniStreamNo(sectorNo);
    end;
  end;
end;

function TCompoundBinaryFile.GetObjectInstance(): TObject;
begin
  Result := Self;
end;

end.

点击查看代码

unit u_DirectoryEntry;

interface

uses
  Windows, Classes, SysUtils;

type
  {目录条目}
  TDirectoryEntry = class(TObject)
  private
    //该条目的下标
    FIdx: DWORD;
    //名称
    FName: WideString;
    //类型
    FEntryType: Byte;
    //该条目所包含的数据的扇区下标
    FDataSectorNo: DWORD;
    //所包含数据的长度
    FDataLen: DWORD;
    //该条目的子条目列表
    FChildren: TList;
    //该条目关联的数据流
    FDataStream: TStream;
    function GetChild(Index: Integer): TDirectoryEntry;
    function GetChildCount(): Integer;
    procedure ClearChild();
  public
    constructor Create(Idx: DWORD; Name: WideString; EntryType: Byte;
      DataSectorNo: DWORD; DataLen: DWORD);
    destructor Destroy(); override;
    property Idx: DWORD read FIdx write FIdx;
    property Name: WideString read FName write FName;
    property EntryType: Byte read FEntryType write FEntryType;
    property DataSectorNo: DWORD read FDataSectorNo write FDataSectorNo;
    property DataLen: DWORD read FDataLen write FDataLen;
    property Children[Index: Integer]: TDirectoryEntry read GetChild;
    property ChildCount: Integer read GetChildCount;
    property DataStream: TStream read FDataStream write FDataStream;
    function AddChild(Child: TDirectoryEntry): Integer;
    function FindChild(Name: WideString): TDirectoryEntry;
  end;


implementation

{ TDirectoryEntry }

constructor TDirectoryEntry.Create(Idx: DWORD; Name: WideString; EntryType: Byte;
  DataSectorNo: DWORD; DataLen: DWORD);
begin
  FIdx := Idx;
  FName := Name;
  FEntryType := EntryType;
  FDataSectorNo := DataSectorNo;
  FDataLen := DataLen;

  FChildren := TList.Create;
end;

destructor TDirectoryEntry.Destroy();
begin
  if FChildren <> nil then
  begin
    ClearChild();
    FreeAndNil(FChildren);
  end;
  if FDataStream <> nil then
    FreeAndNil(FDataStream);
  inherited;
end;

function TDirectoryEntry.GetChild(Index: Integer): TDirectoryEntry;
begin
  Result := nil;
  if (Index > -1) and (Index < FChildren.Count) then
    Result := FChildren[Index];
end;

function TDirectoryEntry.AddChild(Child: TDirectoryEntry): Integer;
begin
  Result := FChildren.Add(Child);
end;

function TDirectoryEntry.GetChildCount: Integer;
begin
  Result := FChildren.Count;
end;

function TDirectoryEntry.FindChild(Name: WideString): TDirectoryEntry;
var
  i, j: Integer;
  entry: TDirectoryEntry;
begin
  Result := nil;
  j := FChildren.Count - 1;
  for i:=0 to j do
  begin
    entry := FChildren[i];
    if WideSameStr(entry.FName, Name) then
    begin
      Result := entry;
      Break;
    end;
  end;
end;

procedure TDirectoryEntry.ClearChild;
var
  i, j: Integer;
begin
  j := FChildren.Count - 1;
  for i:=0 to j do
  begin
    TObject(FChildren.Items[i]).Free;
  end;
  FChildren.Clear;
end;

end.

点击查看代码

unit u_OfficeOpenXmlFile;

interface

uses
  u_IFile, Forms, SysUtils, NativeXml, Classes, VCLZip;

type
  OOXmlException = class(Exception);

  TOfficeOpenXmlFile = class(TInterfacedObject, IFile)
  protected
    FFileName: string;
    FZip: TVCLZip;
    procedure ReadContent(); virtual; abstract;
    function GetXml(XmlName: string): TNativeXml;
  public
    constructor Create(FileName: string);
    destructor Destroy(); override;
    function GetObjectInstance(): TObject;
    function GetTextContent(): WideString; virtual; abstract;
    function FindText(Text: WideString): Boolean; virtual; abstract;
  end;

implementation

{ TOfficeOpenXmlFile }

constructor TOfficeOpenXmlFile.Create(FileName: string);
begin
  FFileName := FileName;
  FZip := TVCLZip.Create(nil);
  FZip.ZipName := FileName;
  FZip.ReadZip;
  ReadContent();
end;

destructor TOfficeOpenXmlFile.Destroy;
begin
  if FZip <>  nil then
    FreeAndNil(FZip);
  inherited;
end;

function TOfficeOpenXmlFile.GetObjectInstance: TObject;
begin
  Result := Self;
end;

{返回指定名称的XML}
function TOfficeOpenXmlFile.GetXml(XmlName: string): TNativeXml;
var
  i: Integer;
  stream: TMemoryStream;
begin
  Result := nil;
  i := 0;
  while i < FZip.Count do
  begin
    if SameText(ExtractFileName(FZip.Filename[i]), XmlName) then
    begin
      stream := TMemoryStream.Create;
      try
        FZip.UnZipToStreamByIndex(stream, i);
        Result := TNativeXml.Create;
        Result.LoadFromStream(stream);
      finally
        if stream <> nil then
          stream.free;
      end;
      Break;
    end;
    Inc(i);
  end;
end;

end.

点击查看代码

unit u_IFile;

interface

type
  IFile = interface
  ['{2A38E410-13FE-49A2-86EF-CAFE8BB60556}']
  //返回该接口所在的对象（delphi中接口与其所在对象的地址是不一样的，接口不能够直接转换为对象）
  function GetObjectInstance(): TObject;
  //返回文本内容
  function GetTextContent(): WideString;
  //查找文本
  function FindText(Text: WideString): Boolean;
  end;

implementation

end.

点击查看代码

unit u_Pcd;

interface

uses
  Windows;

type
  TPcd = class(TObject)
  private
    FTextLocation: DWORD;
    FUnicode: Boolean;
  public
    constructor Create(Info: Word; Fc: DWORD; Prm: Word);
    property TextLocation: DWORD read FTextLocation;
    property Unicode: Boolean read FUnicode;
  end;


implementation

{ TPcd }

constructor TPcd.Create(Info: Word; Fc: DWORD; Prm: Word);
begin
  FTextLocation := Fc and $3FFFFFFF;
  FUnicode := (Fc and $40000000) = 0;
  if not FUnicode then
    FTextLocation := FTextLocation div 2;
end;

end.

点击查看代码

{帮助方法单元}
unit u_Helper;

interface

uses
  Windows;

const
  {Word文件中使用的特殊Ascii字符}
  //单元格之间的分隔符，连续的两个代表下一行的单元格
  CELL_MARK =                           #07;
  //制表符
  TAB =                                 #09;
  BREAK_LINE =                          #11;
  PAGE_BREAK_OR_SECTION_MARK =          #12;
  //换行符
  LINE_BREAK =                          #13;
  CLUMN_BREAK =                         #14;
  //标识特殊标记开始
  FIELD_START =                         #19;
  //FIELD_START到FIELD_SEPERATOR之间的字串是控制用的（不顯示）
  FIELD_SEPERATOR =                     #20;
  //FIELD_SEPERATOR到FIELD_END之間的字串則是顯示用的
  FIELD_END =                           #21;
  NON_BREAKING_HYPHEN =                 #30;
  NON_REQUIRED_HYPHEN =                 #31;
  NON_BREAKING_SPACE  =                 #160;

  {计算BaseNum的平方值，指数为Exponent}
  function Power(BaseNum: Word; Exponent: Word): Word;

  {判断参数Value中FlagPos所指定的位是否为1}
  function IsFlagSet(Value: DWORD; FlagPos: Word): Boolean;

  {过滤office文本中的特殊字符}
  procedure FilterSpecialChar(var Source: WideString);

  {将office文件使用的换行符转换为delphi中使用的换行符}
  procedure ConverLF(var Source: WideString);

implementation

function Power(BaseNum: Word; Exponent: Word): Word;
var
  i: Integer;
begin
  Result := BaseNum;
  for i:=2 to Exponent do
  begin
    Result := Result * BaseNum;
  end;
end;

function IsFlagSet(Value: DWORD; FlagPos: Word): Boolean;
var
  num: Word;
begin
  Dec(FlagPos, 1);
  num := Power(2, FlagPos);
  Result := (Value and num) = num;
end;

procedure FilterSpecialChar(var Source: WideString);
var
  i: DWORD;
  len: DWORD;
  startIgnore: Boolean;
  startIgnoreIdx: DWORD;
begin
  startIgnore := False;
  startIgnoreIdx := 0;
  i := 0;
  len := Length(Source);
  while i < len do
  begin
    Inc(i);
    {过控制字符和通讯专用字符，除了Tab}
    if (Source[i] < #32) and (Source[i] <> #9) then
    begin
      if startIgnore then
      begin
        if Source[i] = FIELD_SEPERATOR then
        begin
          startIgnore := False;
          Delete(Source, startIgnoreIdx, i - startIgnoreIdx + 1);
          i := startIgnoreIdx - 1;
          len := Length(Source);
        end;
      end
      else
      begin
        if Source[i] = FIELD_START then
        begin
          {遇到office的特殊标记符号，记录下下标，之后遇到的任何字符都不再过滤，
          直到遇到FIELD_SEPERATOR后，将两个符号间的字符都删除}
          startIgnore := True;
          startIgnoreIdx := i;
        end
        else if Source[i] = LINE_BREAK then
        begin
          {office中的换行符为#13，这里需要替换为#13#10}
          Inc(len);
          SetLength(Source, len);
          //如果i之后还有字符，则将其向后移一位
          if i < len - 1 then
            Move(Source[i + 1], Source[i + 2], (len - i - 1) * 2);
          Source[i] := #13;
          Source[i + 1] := #10;
          Inc(i);
        end
        else if Source[i] = CELL_MARK then
        begin
          if (i < len) and (Source[i + 1] = CELL_MARK) then
          begin
            {连续两个CELL_MARK代表单元格换行，需要将这两个CELL_MARK符号替换为#13#10}
            Source[i] := #13;
            Source[i + 1] := #10;
            Inc(i);
          end
          else
          begin
            //相邻的单元格之间使用TAB分隔
            Source[i] := #9;
          end;
        end
        else
        begin
          {其它字符直接删除}
          Delete(Source, i, 1);
          Dec(i);
          Dec(len);
        end;
      end;
    end;
  end;
end;

procedure ConverLF(var Source: WideString);
var
  i: DWORD;
  len: DWORD;
begin
  i := 0;
  len := Length(Source);
  while i < len do
  begin
    Inc(i);
    if (Source[i] < #32) and (Source[i] <> #9) then
    begin
      if Source[i] = LINE_BREAK then
      begin
        {office中的换行符为#13，这里需要替换为#13#10}
        SetLength(Source, len + 1);
        //如果当前字符后面还有字符，则将后面的字符向后移一位
        if i < len then
          Move(Source[i + 1], Source[i + 2], (len - i) * 2);
        Source[i] := #13;
        Source[i + 1] := #10;
        Inc(i);
        Inc(len);
      end
      else
      begin
        Delete(Source, i, 1);
        Dec(i);
        Dec(len);
      end;
    end;
  end;
end;

end.

posted @ 2025-12-25 14:45 邓加领阅读(6) 评论(0) 收藏举报

刷新页面返回顶部

邓加领

不定期记录和分享部分个人经验，以证其道

delphi 解析提取 doc/docx/ppt/pptx 文件的文本内容

公告