BatchFileProcessing(2)--实现之解析文件

     前一篇写到BatchFileProcessing的流程设计,这篇总结下解析文件的实现。

     目前我的产品支持用户上传excel,csv,tab delimited等三种文件格式,鉴于office com组件性能太差,所以我们购买了aspose cells组件。下面我们以批量更新商品库存为例,阐述一下解析文件的实现。

     当我们从数据库读取到尚未处理的用户上传文件后,我们会将它下载到服务器磁盘上,然后开始解析。

  文件级别的检查下包括以下几项:

  (1)文件是否可以正常打开,如果不能正常打开,记录异常并通知客户。

  (2)文件中的关键列是否有缺失,如果有缺失,记录异常并通知客户。

  (3)文件中的列是否有重复,如果有重复,记录异常并通知客户。

  (4)文件中是否有未在模版中未定义的列,如果有未在模版中未定义的列,记录异常并通知客户。

     批量更新商品库存的业务检查下包括以下几项:

(1)商品编号是否提供,如果没有提供,记录异常到数据库。

(2)商品库存是否提供,如果没有提供,使用默认值零;如果提供了,验证是否在有效的范围内,比如0~999999,如果不合法,记录异常到数据库。

用 Aspose Cells组件读取数据:

 

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Linq;
  4 using System.Text;
  5 using Aspose.Cells;
  6 
  7 namespace BatchFile.Jobs.FileParsers
  8 {
  9     public class AsposeCellParser : IExcelParser
 10     {
 11         private string m_fileName;
 12         private string m_fileExt;
 13         private FileFormatType m_fileFormatType;
 14         private Workbook m_workbook;
 15         private Worksheet m_worksheet;
 16 
 17         public AsposeCellParser(string fileName, string fileExt)
 18         {
 19             m_fileName = fileName;
 20             m_fileExt = fileExt;
 21             if (FileExtType.IsCSV(m_fileExt))
 22             {
 23                 m_fileFormatType = FileFormatType.CSV;
 24             }
 25             else if (FileExtType.IsExcel2007(m_fileExt))
 26             {
 27                 m_fileFormatType = FileFormatType.Excel2007Xlsx;
 28             }
 29             else if (FileExtType.IsExcel2003(m_fileExt))
 30             {
 31                 m_fileFormatType = FileFormatType.Excel2003;
 32             }
 33             else if (FileExtType.IsTabDelimited(m_fileExt))
 34             {
 35                 m_fileFormatType = FileFormatType.TabDelimited;
 36             }
 37             else
 38             {
 39                 throw new NotSupportedException(string.Format("File extension({0}) cannot be supported.", fileExt));
 40             }
 41         }
 42 
 43         #region IExcelParser
 44 
 45         public string ExcelFile
 46         {
 47             get { return m_fileName; }
 48         }
 49 
 50         public int RowCount
 51         {
 52             get { return m_worksheet.Cells.MaxDataRow; }
 53         }
 54 
 55         public int ColumnCount
 56         {
 57             get { return m_worksheet.Cells.MaxDataColumn; }
 58         }
 59 
 60         public object GetCellValue(int row, int col)
 61         {
 62             return m_worksheet.Cells[row, col].Value;
 63         }
 64 
 65         public void ActiveWorksheet(int activeWorksheetIndex)
 66         {
 67             if (activeWorksheetIndex < 0 || activeWorksheetIndex >= m_workbook.Worksheets.Count)
 68             {
 69                 throw new ArgumentOutOfRangeException("Worksheet index is out of range.");
 70             }
 71             m_worksheet = m_workbook.Worksheets[activeWorksheetIndex];
 72         }
 73 
 74         public void ActiveWorksheet(string activeWorksheetName)
 75         {
 76             if (string.IsNullOrWhiteSpace(activeWorksheetName))
 77             {
 78                 throw new ArgumentException("Worksheet name cannot be null or empty.");
 79             }
 80             m_worksheet = m_workbook.Worksheets[activeWorksheetName];
 81         }
 82 
 83         public void OpenFile()
 84         {
 85             m_workbook = new Workbook();
 86             m_workbook.ConvertNumericData = false;
 87             m_workbook.Open(m_fileName, m_fileFormatType);
 88             m_worksheet = m_workbook.Worksheets[0];
 89         }
 90 
 91         public void OpenFile(int activeWorksheetIndex)
 92         {
 93             OpenFile();
 94             ActiveWorksheet(activeWorksheetIndex);
 95         }
 96 
 97         public void OpenFile(string activeWorksheetName)
 98         {
 99             OpenFile();
100             ActiveWorksheet(activeWorksheetName);
101         }
102 
103         #endregion
104     }
105 }
 
文件检查与解析
  1  public void Process()
  2         {
  3             m_parser = new AsposeCellParser(m_batchInventoryFile.FileName, m_fileExt);
  4 
  5             //open file
  6             try
  7             {
  8                 m_parser.OpenFile();
  9             }
 10             catch
 11             {
 12                 EmailUtil.Send(m_batchInventoryFile.UserEmail, Consts.Batch_Inventory_File_Can_Not_Open, "File you uploaded cannot be open.");
 13                 return;
 14             }
 15 
 16             //resolve headers
 17             var dictMapping = ConfigManager.BatchFileConfig.BatchInventory.InventoryPropertyMappingDict;
 18             List<ExcelHeaderModel> headerList = new List<ExcelHeaderModel>();
 19             List<string> headerNameList = new List<string>();
 20             int rowIndex = 0;
 21             int colIndex = 0;
 22             for (; colIndex < m_parser.ColumnCount; colIndex++)
 23             {
 24                 string headerName = m_parser.GetCellValue(rowIndex, colIndex).ToString().Trim();
 25                 ExcelHeaderModel header = new ExcelHeaderModel();
 26                 header.ColumnIndex = colIndex;
 27                 header.HeaderName = headerName;
 28                 header.IsDuplicated = headerNameList.Contains(headerName);
 29                 
 30                 headerList.Add(header);
 31                 headerNameList.Add(headerName);
 32             }
 33 
 34             //check missing key column or not
 35             List<string> lostKeyList = new List<string>();
 36             foreach (var key in dictMapping.Keys)
 37             {
 38                 var mapping = dictMapping[key];
 39                 if (mapping.IsKey
 40                     && !headerNameList.Contains(mapping.HeaderName))
 41                 {
 42                     lostKeyList.Add(mapping.HeaderName);
 43                 }
 44             }
 45             if (lostKeyList.Count > 0)
 46             {
 47                 string keys = lostKeyList[0];
 48                 for (int i = 1; i < lostKeyList.Count; i++)
 49                 {
 50                     keys += string.Format(",{0}", lostKeyList[i]);
 51                 }
 52 
 53                 EmailUtil.Send(m_batchInventoryFile.UserEmail,
 54                     Consts.Batch_Inventory_File_Missing_Key,
 55                     string.Format("File you uploaded is missing key(s) {0}.", keys));
 56                 return;
 57             }
 58 
 59             //check exist duplicate columns or not
 60             var duplicateColumnList = headerList.FindAll(h => h.IsDuplicated);
 61             if (duplicateColumnList.Count > 0)
 62             {
 63                 string duplicateColumns = duplicateColumnList[0].HeaderName;
 64                 for (int i = 1; i < duplicateColumnList.Count; i++)
 65                 {
 66                     duplicateColumns += string.Format(",{0}", duplicateColumnList[i].HeaderName);
 67                 }
 68                 EmailUtil.Send(m_batchInventoryFile.UserEmail,
 69                    Consts.Batch_Inventory_File_Having_Duplicated_Column,
 70                    string.Format("File you uploaded has duplicated column(s) {0}.", duplicateColumns));
 71                 return;
 72             }
 73 
 74             //check exist undefined columns or not
 75             var undefinedColumnList = headerList.FindAll(h => !h.IsTemplateDefined);
 76             if (undefinedColumnList.Count > 0)
 77             {
 78                 string undefinedColumns = undefinedColumnList[0].HeaderName;
 79                 for (int i = 1; i < undefinedColumnList.Count; i++)
 80                 {
 81                     undefinedColumns += string.Format(",{0}", undefinedColumnList[i].HeaderName);
 82                 }
 83                 EmailUtil.Send(m_batchInventoryFile.UserEmail,
 84                    Consts.Batch_Inventory_File_Having_Undefined_Column,
 85                    string.Format("File you uploaded has undefined column(s) {0}.", undefinedColumns));
 86                 return;
 87             }
 88 
 89             //mapping header according to configuration
 90             foreach (var header in headerList)
 91             {
 92                 if (dictMapping.Keys.Contains(header.HeaderName))
 93                 {
 94                     header.IsTemplateDefined = true;
 95                     var mapping = dictMapping[header.HeaderName];
 96                     header.IsKey = mapping.IsKey;
 97                     header.IsTransactional = mapping.IsTransactional;
 98                     header.PropertyName = mapping.PropertyName;
 99                     header.DataType = mapping.DataType;
100                     header.DefaultValue = mapping.DefaultValue;
101                     header.ColumnName = mapping.ColumnName;
102                     header.IsSSBNode = mapping.IsSSBNode;
103                 }
104             }
105 
106             //check and extract business data
107             if (!Directory.Exists(ConfigManager.BatchFileConfig.BatchInventory.File2Dir))
108             {
109                 Directory.CreateDirectory(ConfigManager.BatchFileConfig.BatchInventory.File2Dir);
110             }
111             var partitionList = Partitioner.Create(1, m_parser.RowCount + 1);
112             Parallel.ForEach(partitionList, (p, loopState) =>
113                 {
114                     DataTable dt = CreateDataTable(headerList);
115                     for (int i = p.Item1; i < p.Item2; i++)
116                     {
117                         DataRow dr = dt.NewRow();
118                         XElement xNode = new XElement(ConfigManager.BatchFileConfig.BatchInventory.RootPropertyName);
119                         List<string> rowErrorList = new List<string>();
120                        
121                         foreach (var header in headerList)
122                         {
123                             object value = null;
124                             string strValue = string.Empty;
125                             if (header.IsKey || header.IsTransactional || header.IsSSBNode)
126                             {
127                                 value = m_parser.GetCellValue(i, header.ColumnIndex);
128                                 strValue = value.ToString().Trim();
129                                 if (string.IsNullOrEmpty(strValue))
130                                 {
131                                     strValue = header.DefaultValue;
132                                 }
133                                 //find property's validator and perform validation
134                                 if (header.IsTemplateDefined)
135                                 {
136                                     var property = dictMapping[header.HeaderName];
137                                     if (!PropertyValidator.Validate(property.Validator, strValue))
138                                     {
139                                         rowErrorList.Add(string.Format(property.Validator.Tips, property.PropertyName));
140                                     }
141                                 }
142                             }
143                             if (header.IsTransactional)
144                             {
145                                 dr[header.ColumnName] = value;
146                             }
147                             if (header.IsSSBNode)
148                             {
149                                 xNode.Add(new XElement(header.PropertyName, new XCData(strValue)));
150                             }
151                         }
152 
153                         dr["BatchFileID"= m_batchInventoryFile.TransactionNumber;
154                         dr["RowIndex"= i.ToString();
155                         if (rowErrorList.Count == 0)
156                         {
157                             string fileName = i.ToString() + ".xml";
158                             string filePath = Path.Combine(ConfigManager.BatchFileConfig.BatchInventory.File2Dir, fileName);
159                             File.WriteAllText(filePath, xNode.ToString());
160                             dr["FileName"= fileName;
161                             dr["CheckResult"= Consts.Success;
162                         }
163                         else
164                         {
165                             dr["CheckResult"= Consts.Failed;
166                             XElement xNodeCheckMemo = new XElement("CheckMemoList", from error in rowErrorList
167                                                                                 select new XElement("CheckMemo", error));
168                             dr["CheckMemo"=xNodeCheckMemo.ToString();
169                         }
170                         dr["HasCheck"= Consts.Yes;
171                         dt.Rows.Add(dr);
172 
173                         if (dt.Rows.Count == ConfigManager.BatchFileConfig.BatchInventory.BatchSize)
174                         {
175                             //write to DB
176                             SQLHelper.BulkCopy(dt);
177                             dt.Rows.Clear();
178                         }
179                     }
180 
181                     //process the last batch
182                     if (dt.Rows.Count > 0)
183                     {
184                         //write to DB
185                         SQLHelper.BulkCopy(dt);
186                         dt.Rows.Clear();
187                     }
188                 });
189         }
190 
191         private DataTable CreateDataTable(List<ExcelHeaderModel> headerList)
192         {
193             DataTable dt = new DataTable(ConfigManager.BatchFileConfig.BatchInventory.DataTableName);
194             foreach (var header in headerList)
195             {
196                 if (header.IsTransactional)
197                 {
198                     dt.Columns.Add(header.ColumnName, Type.GetType(header.DataType));
199                 }
200             }
201             dt.Columns.Add("BatchFileID"typeof(int));
202             dt.Columns.Add("RowIndex"typeof(int));
203             dt.Columns.Add("FileName"typeof(string));
204             dt.Columns.Add("HasCheck"typeof(string));
205             dt.Columns.Add("CheckResult"typeof(string));
206             dt.Columns.Add("CheckMemo"typeof(string));
207 
208             return dt;
209         }
 
为了提高文件解析的性能使用了多线程以及SqlBulkCopy等技术。下面是SQLHelper的定义:
 1 public static class SQLHelper
 2     {
 3         private static string m_connectionString = ConfigurationManager.ConnectionStrings["MSSQL"].ConnectionString;
 4         public static string ConnectionString
 5         {
 6             get
 7             {
 8                 return m_connectionString;
 9             }
10         }
11 
12         public static void BulkCopy(DataTable dt, string destTableName, List<SqlBulkCopyColumnMapping> columnMappingList, int timeoutSeconds = 1200)
13         {
14             using (SqlConnection conn = new SqlConnection(m_connectionString))
15             {
16                 conn.Open();
17                 SqlTransaction trans = conn.BeginTransaction(IsolationLevel.ReadCommitted, "Bulk copy batch data");
18                 SqlBulkCopy b = new SqlBulkCopy(conn, SqlBulkCopyOptions.Default, trans);
19                 b.BulkCopyTimeout = timeoutSeconds;
20                 b.DestinationTableName = destTableName;
21                 if (columnMappingList != null)
22                 {
23                     foreach (var colMapping in columnMappingList)
24                     {
25                         b.ColumnMappings.Add(colMapping);
26                     }
27                 }
28                 b.WriteToServer(dt);
29                 trans.Commit();
30                 b.Close();
31                 conn.Close();
32             }
33         }
34 
35         public static void BulkCopy(DataTable dt)
36         {
37             List<SqlBulkCopyColumnMapping> columnMappingList = new List<SqlBulkCopyColumnMapping>();
38             foreach (DataColumn col in dt.Columns)
39             {
40                 columnMappingList.Add(new SqlBulkCopyColumnMapping
41                 {
42                     SourceColumn = col.ColumnName,
43                     DestinationColumn = col.ColumnName
44                 });
45             }
46             BulkCopy(dt, dt.TableName, columnMappingList);
47         }
48     }

 

posted on 2011-04-05 09:41  James.H.Fu  阅读(1129)  评论(1编辑  收藏  举报

导航