使用NPOI读取Word、Excel文档内容

使用NPOI读取Excel的例子很多,读取Word的例子不多。

Excel的解析方式有多种,可以使用ODBC查询,把Excel作为一个数据集对待。也可以使用文档结构模型的方式进行解析,即解析Workbook(工作簿)、Sheet、Row、Column。

Word的解析比较复杂,因为Word的文档结构模型定义较为复杂。解析Word或者Excel,关键是理解Word、Excel的文档对象模型。

Word、Excel文档对象模型的解析,可以通过COM接口调用,此类方式使用较广。(可以录制宏代码,然后替换为对应的语言)

也可以使用XML模型解析,尤其是对于2007、2010版本的文档的解析。

  1 using NPOI.POIFS.FileSystem;
  2 using NPOI.SS.UserModel;
  3 using NPOI.XSSF.UserModel;
  4 using NPOI.XWPF.UserModel;
  5 using System;
  6 using System.Collections.Generic;
  7 using System.Configuration;
  8 using System.IO;
  9 using System.Text;
 10 
 11 namespace eyuan
 12 {
 13     public static class NOPIHandler
 14     {
 15         /// <summary>
 16         /// 
 17         /// </summary>
 18         /// <param name="fileName"></param>
 19         /// <returns></returns>
 20         public static List<List<List<string>>> ReadExcel(string fileName)
 21         {
 22             //打开Excel工作簿
 23             XSSFWorkbook hssfworkbook = null;
 24             try
 25             {
 26                 using (FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read))
 27                 {
 28                     hssfworkbook = new XSSFWorkbook(file);
 29                 }
 30             }
 31             catch (Exception e)
 32             {
 33                 LogHandler.LogWrite(string.Format("文件{0}打开失败,错误:{1}", new string[] { fileName, e.ToString() }));
 34             }
 35             //循环Sheet页
 36             int sheetsCount = hssfworkbook.NumberOfSheets;
 37             List<List<List<string>>> workBookContent = new List<List<List<string>>>();
 38             for (int i = 0; i < sheetsCount; i++)
 39             {
 40                 //Sheet索引从0开始
 41                 ISheet sheet = hssfworkbook.GetSheetAt(i);
 42                 //循环行
 43                 List<List<string>> sheetContent = new List<List<string>>();
 44                 int rowCount = sheet.PhysicalNumberOfRows;
 45                 for (int j = 0; j < rowCount; j++)
 46                 {
 47                     //Row(逻辑行)的索引从0开始
 48                     IRow row = sheet.GetRow(j);
 49                     //循环列(各行的列数可能不同)
 50                     List<string> rowContent = new List<string>();
 51                     int cellCount = row.PhysicalNumberOfCells;
 52                     for (int k = 0; k < cellCount; k++)
 53                     {
 54                         //ICell cell = row.GetCell(k);
 55                         ICell cell = row.Cells[k];
 56                         if (cell == null)
 57                         {
 58                             rowContent.Add("NIL");
 59                         }
 60                         else
 61                         {
 62                             rowContent.Add(cell.ToString());
 63                             //rowContent.Add(cell.StringCellValue);
 64                         }
 65                     }
 66                     //添加行到集合中
 67                     sheetContent.Add(rowContent);
 68                 }
 69                 //添加Sheet到集合中
 70                 workBookContent.Add(sheetContent);
 71             }
 72 
 73             return workBookContent;
 74         }
 75 
 76         /// <summary>
 77         /// 
 78         /// </summary>
 79         /// <param name="fileName"></param>
 80         /// <returns></returns>
 81         public static string ReadExcelText(string fileName)
 82         {
 83             string ExcelCellSeparator = ConfigurationManager.AppSettings["ExcelCellSeparator"];
 84             string ExcelRowSeparator = ConfigurationManager.AppSettings["ExcelRowSeparator"];
 85             string ExcelSheetSeparator = ConfigurationManager.AppSettings["ExcelSheetSeparator"];
 86             //
 87             List<List<List<string>>> excelContent = ReadExcel(fileName);
 88             string fileText = string.Empty;
 89             StringBuilder sbFileText = new StringBuilder();
 90             //循环处理WorkBook中的各Sheet页
 91             List<List<List<string>>>.Enumerator enumeratorWorkBook = excelContent.GetEnumerator();
 92             while (enumeratorWorkBook.MoveNext())
 93             {
 94 
 95                 //循环处理当期Sheet页中的各行
 96                 List<List<string>>.Enumerator enumeratorSheet = enumeratorWorkBook.Current.GetEnumerator();
 97                 while (enumeratorSheet.MoveNext())
 98                 {
 99 
100                     string[] rowContent = enumeratorSheet.Current.ToArray();
101                     sbFileText.Append(string.Join(ExcelCellSeparator, rowContent));
102                     sbFileText.Append(ExcelRowSeparator);
103                 }
104                 sbFileText.Append(ExcelSheetSeparator);
105             }
106             //
107             fileText = sbFileText.ToString();
108             return fileText;
109         }
110 
111         /// <summary>
112         /// 读取Word内容
113         /// </summary>
114         /// <param name="fileName"></param>
115         /// <returns></returns>
116         public static string ReadWordText(string fileName)
117         {
118             string WordTableCellSeparator = ConfigurationManager.AppSettings["WordTableCellSeparator"];
119             string WordTableRowSeparator = ConfigurationManager.AppSettings["WordTableRowSeparator"];
120             string WordTableSeparator = ConfigurationManager.AppSettings["WordTableSeparator"];
121             //
122             string CaptureWordHeader = ConfigurationManager.AppSettings["CaptureWordHeader"];
123             string CaptureWordFooter = ConfigurationManager.AppSettings["CaptureWordFooter"];
124             string CaptureWordTable = ConfigurationManager.AppSettings["CaptureWordTable"];
125             string CaptureWordImage = ConfigurationManager.AppSettings["CaptureWordImage"];
126             //
127             string CaptureWordImageFileName = ConfigurationManager.AppSettings["CaptureWordImageFileName"];
128             //
129             string fileText = string.Empty;
130             StringBuilder sbFileText = new StringBuilder();
131 
132             #region 打开文档
133             XWPFDocument document = null;
134             try
135             {
136                 using (FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read))
137                 {
138                     document = new XWPFDocument(file);
139                 }
140             }
141             catch (Exception e)
142             {
143                 LogHandler.LogWrite(string.Format("文件{0}打开失败,错误:{1}", new string[] { fileName, e.ToString() }));
144             }
145             #endregion
146 
147             #region 页眉、页脚
148             //页眉
149             if (CaptureWordHeader == "true")
150             {
151                 sbFileText.AppendLine("Capture Header Begin");
152                 foreach (XWPFHeader xwpfHeader in document.HeaderList)
153                 {
154                     sbFileText.AppendLine(string.Format("{0}", new string[] { xwpfHeader.Text }));
155                 }
156                 sbFileText.AppendLine("Capture Header End");
157             }
158             //页脚
159             if (CaptureWordFooter == "true")
160             {
161                 sbFileText.AppendLine("Capture Footer Begin");
162                 foreach (XWPFFooter xwpfFooter in document.FooterList)
163                 {
164                     sbFileText.AppendLine(string.Format("{0}", new string[] { xwpfFooter.Text }));
165                 }
166                 sbFileText.AppendLine("Capture Footer End");
167             }
168             #endregion
169 
170             #region 表格
171             if (CaptureWordTable == "true")
172             {
173                 sbFileText.AppendLine("Capture Table Begin");
174                 foreach (XWPFTable table in document.Tables)
175                 {
176                     //循环表格行
177                     foreach (XWPFTableRow row in table.Rows)
178                     {
179                         foreach (XWPFTableCell cell in row.GetTableCells())
180                         {
181                             sbFileText.Append(cell.GetText());
182                             //
183                             sbFileText.Append(WordTableCellSeparator);
184                         }
185 
186                         sbFileText.Append(WordTableRowSeparator);
187                     }
188                     sbFileText.Append(WordTableSeparator);
189                 }
190                 sbFileText.AppendLine("Capture Table End");
191             }
192             #endregion
193 
194             #region 图片
195             if (CaptureWordImage == "true")
196             {
197                 sbFileText.AppendLine("Capture Image Begin");
198                 foreach (XWPFPictureData pictureData in document.AllPictures)
199                 {
200                     string picExtName = pictureData.suggestFileExtension();
201                     string picFileName = pictureData.GetFileName();
202                     byte[] picFileContent = pictureData.GetData();
203                     //
204                     string picTempName = string.Format(CaptureWordImageFileName, new string[] { Guid.NewGuid().ToString() + "_" + picFileName + "." + picExtName });
205                     //
206                     using (FileStream fs = new FileStream(picTempName, FileMode.Create, FileAccess.Write))
207                     {
208                         fs.Write(picFileContent, 0, picFileContent.Length);
209                         fs.Close();
210                     }
211                     //
212                     sbFileText.AppendLine(picTempName);
213                 }
214                 sbFileText.AppendLine("Capture Image End");
215             }
216             #endregion
217 
218             //正文段落
219             sbFileText.AppendLine("Capture Paragraph Begin");
220             foreach (XWPFParagraph paragraph in document.Paragraphs)
221             {
222                 sbFileText.AppendLine(paragraph.ParagraphText);
223 
224             }
225             sbFileText.AppendLine("Capture Paragraph End");
226             //
227 
228             //
229             fileText = sbFileText.ToString();
230             return fileText;
231         }
232 
233 
234     }
235 }

 

 

posted @ 2014-05-30 23:04  马洪彪  阅读(19629)  评论(4编辑  收藏  举报