用自己的代码逐个字符解析,速度较慢;尝试了 libxml2 也比较慢,因为它需要一次性把文件读入内存,而 expat 库支持流式读取。于是就让龙猫写了一个基于 expat 的程序,毕竟是久经考验的库,程序很快就调试通过了。要不是我一开始没信心,让它先输出10行试试,还能少走很多弯路。

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <expat.h>
        #define MAX_CELL_CONTENT 256
        /**
         * Inclusive rectangle of cells to extract, written like "A1:Z100".
         * Columns are stored as single characters, so only one-letter columns
         * can be described.
         */
        typedef struct ParseRange {
            int  start_row;  /* first row of the range (inclusive) */
            int  end_row;    /* last row of the range (inclusive) */
            char start_col;  /* first column letter (inclusive) */
            char end_col;    /* last column letter (inclusive) */
        } ParseRange;
        /** One extracted cell: its row number, its column letter and its text. */
        typedef struct CellResult {
            int  row;                      /* row number of the cell */
            char col;                      /* column letter of the cell */
            char value[MAX_CELL_CONTENT];  /* cell text */
        } CellResult;
        /**
         * Tell whether a cell lies inside a range; every bound is inclusive.
         *
         * @return 1 when (row, col) is inside `range`, 0 otherwise
         */
        int is_cell_in_range(int row, char col, ParseRange range) {
            if (row < range.start_row || row > range.end_row) {
                return 0;
            }
            if (col < range.start_col || col > range.end_col) {
                return 0;
            }
            return 1;
        }
        /** Growable array of extracted cells. */
        typedef struct DynamicResults {
            CellResult *data;  /* heap buffer holding the cells */
            int count;         /* number of cells currently stored */
            int capacity;      /* number of cells the buffer has room for */
        } DynamicResults;

        /* Global result store; it starts out empty with no buffer attached. */
        DynamicResults all_results = { .data = NULL, .count = 0, .capacity = 0 };
        /**
         * Allocate the global result array with its initial capacity and reset
         * the element count.
         *
         * A buffer left over from an earlier call is released first, so calling
         * this twice does not leak. The program is terminated when the
         * allocation fails, because no result could be stored afterwards.
         */
        void init_results(void) {
            enum { INITIAL_CAPACITY = 1024 };

            free(all_results.data);  /* free(NULL) is a no-op on the first call */
            all_results.data = malloc(INITIAL_CAPACITY * sizeof *all_results.data);
            if (all_results.data == NULL) {
                fprintf(stderr, "错误: 内存分配失败\n");
                exit(EXIT_FAILURE);
            }
            all_results.capacity = INITIAL_CAPACITY;
            all_results.count = 0;
        }
        // 扩容动态数组
        void ensure_capacity(int needed) {
        if (all_results.count + needed >= all_results.capacity) {
        all_results.capacity *= 2;
        // 翻倍扩容
        all_results.data = realloc(all_results.data, all_results.capacity * sizeof(CellResult));
        }
        }
        // 添加结果(无限制版本)
        void add_cell_result(int row, char col, const char *value, int is_empty) {
        ensure_capacity(1);
        // 确保有空间
        all_results.data[all_results.count].row = row;
        all_results.data[all_results.count].col = col;
        strncpy(all_results.data[all_results.count].value, value, MAX_CELL_CONTENT - 1);
        all_results.count++;
        }
        // 释放内存
        void free_results() {
        free(all_results.data);
        all_results.data = NULL;
        all_results.count = all_results.capacity = 0;
        }
        /**
         * Parse a range such as "A1:Z100" into `range`.
         *
         * Each corner is exactly one column letter followed by a row number.
         * Lowercase letters are accepted and stored as uppercase. The string is
         * rejected when a column is not a letter, a row is smaller than 1, the
         * start lies after the end, or anything follows the second row number.
         *
         * @param range_str text to parse
         * @param range     receives the parsed range; left untouched on failure
         * @return 0 on success, -1 when the text is not a valid range
         */
        int parse_excel_range(const char *range_str, ParseRange *range) {
            if (range_str == NULL || range == NULL) {
                return -1;
            }

            char first_col, last_col, trailing;
            int first_row, last_row;
            /* %9d caps the digit count so the conversion cannot overflow a 32-bit
               int; the final %c only converts when junk follows the range. */
            int fields = sscanf(range_str, "%c%9d:%c%9d%c",
                                &first_col, &first_row, &last_col, &last_row, &trailing);
            if (fields != 4) {
                return -1;
            }

            if (first_col >= 'a' && first_col <= 'z') {
                first_col = (char)(first_col - 'a' + 'A');
            }
            if (last_col >= 'a' && last_col <= 'z') {
                last_col = (char)(last_col - 'a' + 'A');
            }
            if (first_col < 'A' || first_col > 'Z' || last_col < 'A' || last_col > 'Z') {
                return -1;
            }
            if (first_row < 1 || last_row < 1) {
                return -1;
            }
            if (first_col > last_col || first_row > last_row) {
                return -1;
            }

            range->start_col = first_col;
            range->start_row = first_row;
            range->end_col = last_col;
            range->end_row = last_row;
            return 0;
        }
        /** Mutable state handed to the expat callbacks while one sheet is parsed. */
        typedef struct ParserState {
            ParseRange range;                   /* cells the caller asked for */
            int  in_row;                        /* nonzero while inside a <row> element */
            int  current_row;                   /* row number of the current <row>, -1 when unknown */
            char current_col;                   /* column letter of the <c> element being read */
            int  value_started;                 /* nonzero while text of a <v>/<t> element is collected */
            char temp_value[MAX_CELL_CONTENT];  /* text collected so far */
            int  value_len;                     /* bytes of temp_value already used */
            int  rows_parsed;                   /* number of <row> elements seen with an "r" attribute */
        } ParserState;
        /**
         * expat start-tag callback: tracks the current row and cell and arms
         * text collection.
         *
         *  - <row>: stores the row number from its "r" attribute; the row is -1
         *    when the attribute is missing or is not a positive int.
         *  - <c> inside a row: stores the column letter from its "r" attribute
         *    (e.g. 'B' for "B12"). Only one-letter columns can be represented, so
         *    references such as "AA12", and cells without an "r" attribute, get
         *    the column '\0' instead of being filed under a wrong letter.
         *  - <v> / <t>: starts collecting text when the current row lies inside
         *    the requested rows.
         *
         * @param user_data the ParserState registered with XML_SetUserData
         * @param name      element name
         * @param attrs     NULL-terminated array of name/value attribute pairs
         */
        void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
            ParserState *state = (ParserState *)user_data;

            if (strcmp(name, "row") == 0) {
                state->in_row = 1;
                state->current_row = -1;
                for (int i = 0; attrs[i]; i += 2) {
                    if (strcmp(attrs[i], "r") == 0) {
                        /* strtol, unlike atoi, has defined behaviour for values
                           that do not fit, so they can be rejected here. */
                        long parsed = strtol(attrs[i + 1], NULL, 10);
                        if (parsed >= 1 && parsed <= INT_MAX) {
                            state->current_row = (int)parsed;
                        }
                        state->rows_parsed++;
                        break;
                    }
                }
            } else if (strcmp(name, "c") == 0 && state->in_row) {
                /* Forget the previous cell's column before reading this one. */
                state->current_col = '\0';
                for (int i = 0; attrs[i]; i += 2) {
                    if (strcmp(attrs[i], "r") == 0) {
                        const XML_Char *ref = attrs[i + 1];
                        int multi_letter = ref[0] != '\0' && ref[1] >= 'A' && ref[1] <= 'Z';
                        if (!multi_letter) {
                            state->current_col = ref[0];
                        }
                        break;
                    }
                }
            } else if (strcmp(name, "v") == 0 || strcmp(name, "t") == 0) {
                if (state->current_row >= state->range.start_row &&
                    state->current_row <= state->range.end_row) {
                    state->value_started = 1;
                    state->value_len = 0;
                    state->temp_value[0] = '\0';
                }
            }
        }
          /**
           * expat character-data callback: appends one chunk of text to the value
           * being collected.
           *
           * This handler can run more than once for a single element, so chunks
           * are concatenated. When the text no longer fits into temp_value it is
           * truncated: as much of the chunk as fits is kept and every later chunk
           * of the same value is ignored, so the stored text is always a prefix
           * of the real value and never has a gap in the middle.
           *
           * @param user_data the ParserState registered with XML_SetUserData
           * @param s         text chunk, not NUL-terminated
           * @param len       number of bytes in `s`
           */
          void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
              ParserState *state = (ParserState *)user_data;
              const int limit = MAX_CELL_CONTENT - 1;  /* last byte is kept for the terminator */

              if (!state->value_started || len <= 0 || state->value_len >= limit) {
                  return;
              }

              int room = limit - state->value_len;
              int take = len;
              int truncated = 0;
              if (take > room) {
                  take = room;
                  truncated = 1;
                  /* NOTE(review): assumes expat delivers UTF-8 (the default build).
                     Back up while the first dropped byte is a continuation byte so
                     that a multi-byte character is not cut in half. */
                  while (take > 0 && ((unsigned char)s[take] & 0xC0) == 0x80) {
                      take--;
                  }
              }

              memcpy(state->temp_value + state->value_len, s, (size_t)take);
              state->temp_value[state->value_len + take] = '\0';

              /* After a truncation value_len is pinned to the limit: it marks the
                 buffer as full so that later chunks are not appended after the gap. */
              state->value_len = truncated ? limit : state->value_len + take;
          }
          /**
           * expat end-tag callback.
           *
           *  - </row>: leaves the row.
           *  - </c>: forgets the cell's column, so a following cell that carries
           *    no reference of its own cannot inherit it.
           *  - </v> / </t>: when text was being collected, stores it as a result
           *    if the cell lies inside the requested range, then stops collecting.
           *
           * @param user_data the ParserState registered with XML_SetUserData
           * @param name      element name
           */
          void XMLCALL end_element(void *user_data, const XML_Char *name) {
              ParserState *state = (ParserState *)user_data;

              if (strcmp(name, "row") == 0) {
                  state->in_row = 0;
                  return;
              }
              if (strcmp(name, "c") == 0) {
                  state->current_col = '\0';
                  return;
              }

              int is_value_tag = strcmp(name, "v") == 0 || strcmp(name, "t") == 0;
              if (!is_value_tag || !state->value_started) {
                  return;
              }

              if (is_cell_in_range(state->current_row, state->current_col, state->range)) {
                  add_cell_result(state->current_row, state->current_col, state->temp_value, 0);
              }
              state->value_started = 0;
          }
          /**
           * Stream a worksheet XML file through expat in 8 KiB chunks; the
           * element callbacks collect the cells inside `range` into the global
           * result store.
           *
           * @param filename path of the XML file to read
           * @param range    cells to extract
           * @return 0 when the whole file was parsed, -1 when the file cannot be
           *         opened or read, the parser cannot be created, or the XML is
           *         malformed (cells collected before the error stay in the store)
           */
          int parse_sheet_xml(const char *filename, ParseRange range) {
              FILE *file = fopen(filename, "rb");
              if (!file) {
                  printf("错误: 无法打开文件 %s\n", filename);
                  return -1;
              }

              XML_Parser parser = XML_ParserCreate(NULL);
              if (!parser) {
                  printf("错误: 无法创建XML解析器\n");
                  fclose(file);
                  return -1;
              }

              ParserState state = {
                  0
              };
              state.range = range;
              XML_SetUserData(parser, &state);
              XML_SetElementHandler(parser, start_element, end_element);
              XML_SetCharacterDataHandler(parser, character_data);

              int status = 0;
              char buffer[8192];
              int done;
              do {
                  size_t len = fread(buffer, 1, sizeof buffer, file);
                  if (ferror(file)) {
                      printf("错误: 读取文件失败 %s\n", filename);
                      status = -1;
                      break;
                  }
                  /* A short read without an error means the end of the file. */
                  done = (len < sizeof buffer);
                  if (XML_Parse(parser, buffer, (int)len, done) == XML_STATUS_ERROR) {
                      /* The line number is an unsigned XML_Size, not an int. */
                      printf("解析错误: %s (行 %lu)\n",
                             XML_ErrorString(XML_GetErrorCode(parser)),
                             (unsigned long)XML_GetCurrentLineNumber(parser));
                      status = -1;
                      break;
                  }
              } while (!done);

              fclose(file);
              XML_ParserFree(parser);
              return status;
          }
          /* Write one CSV field; fields containing a comma, a double quote or a
             line break are quoted, with embedded quotes doubled. */
          static void write_csv_field(FILE *out, const char *text) {
              if (strpbrk(text, ",\"\r\n") == NULL) {
                  fputs(text, out);
                  return;
              }
              fputc('"', out);
              for (const char *p = text; *p != '\0'; p++) {
                  if (*p == '"') {
                      fputc('"', out);
                  }
                  fputc(*p, out);
              }
              fputc('"', out);
          }

          /* Write the sheet row stored in results [first, last): the row number,
             then one field per column from min_col to max_col (empty when the
             row has no cell in that column; the first match wins). */
          static void write_csv_row(FILE *out, int first, int last, char min_col, char max_col) {
              fprintf(out, "%d", all_results.data[first].row);
              for (int col = min_col; col <= max_col; col++) {
                  fputc(',', out);
                  for (int k = first; k < last; k++) {
                      if (all_results.data[k].col == col) {
                          write_csv_field(out, all_results.data[k].value);
                          break;
                      }
                  }
              }
              fputc('\n', out);
          }

          /**
           * Write the global results to a CSV file.
           *
           * The header is "Row" followed by every column letter between the
           * smallest and the largest column found in the results. Each run of
           * consecutive results with the same row number becomes one line,
           * including the final run. With no results only the header is written.
           *
           * @param filename output file name
           * @return 0 on success, -1 when the file cannot be created or written
           */
          int save_results_to_csv(const char *filename) {
              FILE *csv = fopen(filename, "w");
              if (!csv) {
                  printf("错误: 无法创建CSV文件 %s\n", filename);
                  return -1;
              }

              /* Header: column span actually present in the results. */
              char min_col = 0;
              char max_col = 0;
              fprintf(csv, "Row");
              if (all_results.count > 0) {
                  min_col = max_col = all_results.data[0].col;
                  for (int i = 1; i < all_results.count; i++) {
                      if (all_results.data[i].col < min_col) min_col = all_results.data[i].col;
                      if (all_results.data[i].col > max_col) max_col = all_results.data[i].col;
                  }
                  /* int counter: a char counter could overflow at CHAR_MAX */
                  for (int col = min_col; col <= max_col; col++) {
                      fprintf(csv, ",%c", col);
                  }
              }
              fprintf(csv, "\n");

              /* Data: flush a row whenever the row number changes and once more
                 at the end, so the last row is not lost. */
              int row_start_idx = 0;
              for (int i = 1; i <= all_results.count; i++) {
                  if (i == all_results.count ||
                      all_results.data[i].row != all_results.data[row_start_idx].row) {
                      write_csv_row(csv, row_start_idx, i, min_col, max_col);
                      row_start_idx = i;
                  }
              }

              int failed = ferror(csv);
              if (fclose(csv) != 0) {
                  failed = 1;
              }
              if (failed) {
                  printf("错误: 写入CSV文件失败 %s\n", filename);
                  return -1;
              }
              printf("CSV已保存到 %s\n", filename);
              return 0;
          }
          /**
           * Program entry: `<xml file> <range>`.
           *
           * Parses the worksheet XML, keeps the cells inside the range and writes
           * them to a CSV file named like the input with its extension replaced
           * by ".csv" (or with ".csv" appended when the file name has none).
           *
           * @return 0 when the CSV was written, 1 on a usage error or any failure
           */
          int main(int argc, char *argv[]) {
              if (argc != 3) {
                  printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argc > 0 ? argv[0] : "expatxml");
                  return 1;
              }

              ParseRange range;
              if (parse_excel_range(argv[2], &range) != 0) {
                  printf("错误: 无效范围格式,应为 A1:Z100\n");
                  return 1;
              }
              printf("解析范围: %c%d:%c%d\n", range.start_col, range.start_row, range.end_col, range.end_row);

              init_results();
              int exit_code = 1;
              if (parse_sheet_xml(argv[1], range) == 0) {
                  /* Only a dot inside the last path component is an extension;
                     a dot in a directory name or a leading dot is not. */
                  const char *input = argv[1];
                  const char *slash = strrchr(input, '/');
                  const char *base = slash ? slash + 1 : input;
                  const char *dot = strrchr(base, '.');
                  size_t stem_len = (dot != NULL && dot != base) ? (size_t)(dot - input)
                                                                 : strlen(input);

                  /* Sized from the input, so long paths are neither truncated
                     nor written past the end of a fixed buffer. */
                  char *csv_filename = malloc(stem_len + sizeof ".csv");
                  if (csv_filename == NULL) {
                      printf("错误: 内存分配失败\n");
                  } else {
                      memcpy(csv_filename, input, stem_len);
                      memcpy(csv_filename + stem_len, ".csv", sizeof ".csv");
                      if (save_results_to_csv(csv_filename) == 0) {
                          exit_code = 0;
                      }
                      free(csv_filename);
                  }
              }
              free_results();
              return exit_code;
          }

编译执行

gcc -o expatxml3 expatxml3.c -lexpat -O3
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A1:Z10000
解析范围: A1:Z10000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv
real 0m6.508s
user 0m2.132s
sys 0m0.392s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A100001:Z110000
解析范围: A100001:Z110000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv
real 0m6.534s
user 0m2.111s
sys 0m0.431s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A1:Z1000000
解析范围: A1:Z1000000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv
real 0m10.207s
user 0m3.046s
sys 0m1.795s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A300000:Z660000
解析范围: A300000:Z660000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv
real 0m9.378s
user 0m2.574s
sys 0m1.030s

针对60万行16列、300MB的xml,这个时间还不错;但是程序没有考虑 sharedStrings.xml(共享字符串表),如果加上这一步,会慢一些。

posted on 2025-09-20 12:32  lxjshuju  阅读(7)  评论(0)    收藏  举报