批量采集培训机构数据
最近在做一个培训机构(学校)查询网站,发现有一个提供这方面数据的接口,所以用 PHP 写了下面这个脚本来进行查询。
在php环境新建个peixun.php文件,
代码如下:
<?php
// peixun.php
//
// Small proxy script: for one province, one child area of that province and
// one result page, it POSTs a search to xwpx.eduyun.cn and echoes the upstream
// JSON, extended with the bookkeeping fields current_index / next_index /
// current_city_code / total_cities so a caller can walk through every child
// area in turn.
//
// Query parameters:
//   page   - page number forwarded as `pageNo` (default 1, minimum 1)
//   area   - 0-based index into the province code list below
//   object - forwarded as `object`; the value 5 is expanded to "0,1,2,3,4"
//   type   - forwarded as `profitType` (default '1' when omitted)
//   index  - 0-based index into the child-area list returned for the
//            province; out-of-range values fall back to 0

/**
 * Send a form-encoded POST request and return the raw response body.
 *
 * @param string   $url     Target URL.
 * @param array    $fields  Form fields; encoded with http_build_query().
 * @param string[] $headers Complete request header lines ("Name: value").
 * @param string   $cookie  Value for the Cookie request header.
 *
 * @return string Raw response body.
 *
 * @throws RuntimeException When the cURL handle cannot be created or the
 *                          transfer fails; the message is the cURL error text.
 */
function postForm($url, array $fields, array $headers, $cookie): string
{
    $ch = curl_init();
    if ($ch === false) {
        throw new RuntimeException('curl_init failed');
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => http_build_query($fields),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER     => $headers,
        CURLOPT_COOKIE         => $cookie,
        // Certificate verification stays enabled: the request carries a
        // session cookie, so the peer must be authenticated.
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        // Bound the request so a stalled upstream cannot hang this script.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);

    $body = curl_exec($ch);
    $error = ($body === false || curl_errno($ch)) ? curl_error($ch) : null;
    curl_close($ch);

    if ($error !== null) {
        throw new RuntimeException($error);
    }

    return (string) $body;
}

/**
 * Fetch the list of child area codes for a parent area.
 *
 * Failures (transport error, invalid JSON, unexpected payload shape or a
 * retCode other than '000000') are logged where applicable and reported as
 * an empty list.
 *
 * @param string|int $areaid Parent area code.
 *
 * @return array Child area codes; empty when none could be obtained.
 */
function getareaid($areaid)
{
    try {
        $response = postForm(
            'https://xwpx.eduyun.cn/xspxRegister/getChildArea',
            [
                'PAGE_SERIAL_VERSION_UID' => '',
                'areaCode'                => $areaid,
            ],
            [
                'Referer: https://xwpx.eduyun.cn/tol/toHomePageParentServices',
                'User-Agent: mozilla/5.0 (macintosh; intel mac os x 10_15_1) applewebkit/537.36 (khtml, like gecko) brave chrome/78.0.3904.70 safari/537.36 Edg/139.0.0.0',
                'Content-Type: application/x-www-form-urlencoded',
            ],
            'HWWAFSESID=d82bf97755bdb320d6; HWWAFSESTIME=1755569214521; SESSION=552af9b3-ac7a-4bba-b35a-27978f142f24; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk4YzAxNDY3MTE5YWItMDkxNmMxNmMxNmMxNmMtNGM2NTdiNTgtMjA3MzYwMC0xOThjMDE0NjcxMjExM2QifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%7D; Hm_lvt_c3f009f814f701e8fad8a17f9682ec79=1755596687; HMACCOUNT=497681945067787A; Hm_lpvt_c3f009f814f701e8fad8a17f9682ec79=1755596697'
        );
    } catch (RuntimeException $e) {
        error_log('cURL错误: ' . $e->getMessage());
        return [];
    }

    $result = json_decode($response, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        error_log('JSON解析错误: ' . json_last_error_msg());
        return [];
    }

    // Valid JSON can still be a scalar or lack the expected keys, so the
    // shape is checked before any offset is read.
    if (
        !is_array($result)
        || ($result['retCode'] ?? null) !== '000000'
        || empty($result['data'])
        || !is_array($result['data'])
    ) {
        return [];
    }

    $areaCodes = [];
    foreach ($result['data'] as $item) {
        if (is_array($item) && !empty($item['areaCode'])) {
            $areaCodes[] = $item['areaCode'];
        }
    }

    return $areaCodes;
}

// The response is always JSON. The guard avoids a "headers already sent"
// warning if something was output before this script started.
if (!headers_sent()) {
    header('Content-Type: application/json; charset=utf-8');
}

// Page number: default 1; non-numeric or non-positive input is clamped to 1.
$pageParam = $_GET['page'] ?? 1;
$page = is_scalar($pageParam) ? max(1, (int) $pageParam) : 1;

// Province-level area codes; the `area` query parameter is a 0-based index
// into this list.
$numbers = [
    110000, 120000, 130000, 140000, 150000, 210000, 220000, 230000,
    310000, 320000, 330000, 340000, 350000, 360000, 370000, 410000,
    420000, 430000, 440000, 450000, 460000, 500000, 510000, 520000,
    530000, 540000, 610000, 620000, 630000, 640000, 650000, 660000,
];

// Resolve and validate the province. The is_string() check rejects
// array-valued input such as ?area[]=1 before it is used as an array offset.
$area = $_GET['area'] ?? '';
$areaid = (is_string($area) && isset($numbers[$area])) ? $numbers[$area] : null;
if ($areaid === null) {
    echo json_encode(['error' => '无效的地区参数'], JSON_UNESCAPED_UNICODE);
    exit();
}

// Look up the child areas of the chosen province; nothing to query without them.
$childAreaCodes = getareaid($areaid);
if (empty($childAreaCodes)) {
    echo json_encode(['error' => '未获取到子地区编码'], JSON_UNESCAPED_UNICODE);
    exit();
}

// `object`: the value 5 is shorthand for all of 0-4.
$object = $_GET['object'] ?? '';
if (!is_string($object)) {
    $object = '';
}
if ($object === '5') {
    $object = '0,1,2,3,4';
}

// `type` is forwarded as profitType. When it is omitted the request sends
// '1', the value that was previously hard-coded.
$profitType = $_GET['type'] ?? '';
if (!is_string($profitType) || $profitType === '') {
    $profitType = '1';
}

// Index of the child area to query; anything outside the list resets to 0.
$totalCities = count($childAreaCodes);
$indexParam = $_GET['index'] ?? 0;
$currentIndex = is_scalar($indexParam) ? (int) $indexParam : 0;
if ($currentIndex < 0 || $currentIndex >= $totalCities) {
    $currentIndex = 0;
}

$currentCityCode = $childAreaCodes[$currentIndex];
// Wraps around to 0 after the last child area.
$nextIndex = ($currentIndex + 1) % $totalCities;

// Search request for exactly one child area.
$postData = [
    'PAGE_SERIAL_VERSION_UID' => '',
    'province'                => $areaid,
    'city'                    => $currentCityCode,
    'area'                    => '',
    'object'                  => $object,
    'profitType'              => $profitType,
    'businessType'            => '1',
    'pageNo'                  => $page,
    'pageSize'                => 10,
];

try {
    $response = postForm(
        'https://xwpx.eduyun.cn/tolSpInfo/getSpInfoList',
        $postData,
        [
            'Content-Type: application/x-www-form-urlencoded',
            'Referer: https://xwpx.eduyun.cn/tolSpInfo/index',
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        ],
        'HWWAFSESID=d82bf97755bdb320d6; HWWAFSESTIME=1755569214521; SESSION=552af9b3-ac7a-4bba-b35a-27978f142f24; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk4YzAxNDY3MTE5YWItMDkxNmMxNmMxNmMxNmMtNGM2NTdiNTgtMjA3MzYwMC0xOThjMDE0NjcxMjExM2QifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%7D'
    );

    // Only a non-empty JSON object/array counts as a usable payload; a
    // scalar could not carry the bookkeeping keys added below.
    $responseData = json_decode($response, true);
    $result = (is_array($responseData) && $responseData)
        ? $responseData
        : ['error' => '无法解析响应数据'];

    // Bookkeeping so the caller can request the next child area.
    $result['current_index'] = $currentIndex;
    $result['next_index'] = $nextIndex;
    $result['current_city_code'] = $currentCityCode;
    $result['total_cities'] = $totalCities;
} catch (RuntimeException $e) {
    $result = ['error' => '请求错误: ' . $e->getMessage()];
}

echo json_encode($result, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
?>
这样就可以,执行的参数为:peixun.php?page=[地址参数]&area=[地址参数1]&object=[地址参数2]&type=[地址参数3]&index=[地址参数4]
其中:
地址参数:1-49
地址参数1:0-31(省级地区编码数组的下标,从 0 开始,0 对应 110000)
地址参数2:1-5
地址参数3: 1-5
地址参数4:0-30(子地区列表的下标,从 0 开始;超出实际子地区数量时会被重置为 0)
千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。