以下代码用于爬取近 30 年(1990–2020 年)的地震数据,实际运行时间约为 4.5 小时(博主电脑配置较低):
/// <summary>
/// Crawls the earthquake listing pages of ditu.92cha.com month by month for
/// 1990 through 2020. For every downloaded page, each text captured after
/// <c>text-center"&gt;</c> is written as one line to 1.txt, and each text
/// captured after <c>_blank"&gt;</c> is written as one line to 2.txt.
/// Both output files are recreated on every run.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Magnitude bounds sent as the dizhen_zjs / dizhen_zje query parameters.
    string minMagnitude = "1";
    string maxMagnitude = "10";

    // FileMode.Create truncates output left over from an earlier run, and the
    // using blocks guarantee the writers are flushed and closed before the
    // program waits for a key press.
    using (StreamWriter cellWriter = new StreamWriter(new FileStream("1.txt", FileMode.Create)))
    using (StreamWriter linkWriter = new StreamWriter(new FileStream("2.txt", FileMode.Create)))
    using (WebClient client = new WebClient())
    {
        // Decode the downloaded pages as UTF-8 text.
        client.Encoding = Encoding.UTF8;

        for (int year = 1990; year <= 2020; year++)
        {
            for (int month = 1; month <= 12; month++)
            {
                // Progress indicator in YYYYMM form.
                Console.WriteLine(year * 100 + month);

                // Start every month at one page; the real page count is read
                // from the first downloaded page. Without this reset a month
                // with no page-count text would reuse the previous month's count.
                int pageCount = 1;

                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    // NOTE(review): the end date is always "-31" and the month is not
                    // zero-padded; the URL format is kept exactly as the original
                    // sent it - confirm how the server treats e.g. "1990-2-31".
                    string url = "http://ditu.92cha.com/dizhen.php?page=" + pageNumber.ToString()
                        + "&dizhen_ly=usa&dizhen_zjs=" + minMagnitude
                        + "&dizhen_zje=" + maxMagnitude
                        + "&dizhen_riqis=" + year.ToString() + "-" + month.ToString() + "-01"
                        + "&dizhen_riqie=" + year.ToString() + "-" + month.ToString() + "-31";

                    string html;
                    try
                    {
                        html = client.DownloadString(url);
                    }
                    catch (WebException ex)
                    {
                        // One failed request should not abort a multi-hour crawl:
                        // report it and move on to the next page.
                        Console.Error.WriteLine("Download failed for " + url + ": " + ex.Message);
                        continue;
                    }

                    // Table cell texts, one output line per match.
                    foreach (Match cell in Regex.Matches(html, "text-center\">(.*)</td"))
                    {
                        cellWriter.WriteLine(cell.Groups[1].Value);
                    }

                    // Total page count announced by the page; the last match wins.
                    foreach (Match pageInfo in Regex.Matches(html, "条记录,分(.*)页显示"))
                    {
                        pageCount = Convert.ToInt32(pageInfo.Groups[1].Value);
                    }

                    // Link texts, one output line per match.
                    foreach (Match link in Regex.Matches(html, "_blank\">(.*)</a>"))
                    {
                        linkWriter.WriteLine(link.Groups[1].Value);
                    }
                }
            }
        }
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
爬取完成后需要对数据做进一步处理,这里分别写入 xlsx 文件和数据库。以下是写入 xlsx 的代码,需要先在项目依赖项中添加 Spire.XLS:
using System;
using System.IO;
using Spire.Xls;

namespace txt_to_xml
{
    /// <summary>
    /// Copies the crawler output from 1.txt and 2.txt into the first worksheet
    /// of 1.xlsx: every five consecutive lines of 1.txt fill columns 1-5 of a
    /// row and every two consecutive lines of 2.txt fill columns 6-7.
    /// </summary>
    class Program
    {
        // Maximum number of rows a single .xlsx worksheet can hold.
        const int MaxXlsxRows = 1048576;

        /// <summary>
        /// Reads both text files until 1.txt is exhausted (or the worksheet row
        /// limit is reached), fills the worksheet, prints the number of rows
        /// written and saves the workbook back to 1.xlsx.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Workbook workbook = new Workbook();
            workbook.LoadFromFile("1.xlsx");
            Worksheet sheet = workbook.Worksheets[0];

            int rowsWritten = 0;

            // Open the inputs read-only: a missing input file is an error and
            // must not be silently replaced by a newly created empty file.
            using (StreamReader cellReader = new StreamReader("1.txt"))
            using (StreamReader linkReader = new StreamReader("2.txt"))
            {
                string firstCell;

                // Stop at end of input instead of a hard-coded record count,
                // and never go past the worksheet's row capacity.
                while (rowsWritten < MaxXlsxRows && (firstCell = cellReader.ReadLine()) != null)
                {
                    int row = rowsWritten + 1; // worksheet rows are addressed from 1 here

                    sheet.Range[row, 1].Text = firstCell;
                    for (int column = 2; column <= 5; column++)
                    {
                        // A truncated final record yields empty cells rather than null.
                        sheet.Range[row, column].Text = cellReader.ReadLine() ?? string.Empty;
                    }

                    // The original assigned both link texts to column 6, so the
                    // second overwrote the first; the second now goes to column 7.
                    sheet.Range[row, 6].Text = linkReader.ReadLine() ?? string.Empty;
                    sheet.Range[row, 7].Text = linkReader.ReadLine() ?? string.Empty;

                    rowsWritten++;
                }

                if (rowsWritten == MaxXlsxRows && cellReader.Peek() >= 0)
                {
                    Console.Error.WriteLine("Worksheet row limit reached; remaining records were not written.");
                }
            }

            Console.WriteLine(rowsWritten);
            workbook.SaveToFile("1.xlsx", ExcelVersion.Version2010);
        }
    }
}
值得一提的是,xls 每个工作表最多只能容纳 65,536 行,xlsx 最多也只能容纳 1,048,576 行(约 100 万行);记录更多时需要改用数据库(C# 操作数据库的方法参见数据库专题)。