批量采集培训机构数据
最近做一个培训机构学校查询网,发现有个这方面的数据,所以使用php写了这个接口进行查询。
在php环境新建个peixun.php文件,
代码如下:
<?php
// Every response of this script is JSON; declare that before anything is echoed.
// The guard avoids a warning when output has already started.
if (!headers_sent()) {
    header('Content-Type: application/json; charset=utf-8');
}

// Endpoint queried by the main request further down in this script.
$url = 'https://xwpx.eduyun.cn/tolSpInfo/getSpInfoList';

// Page number: defaults to 1 and is clamped to a positive integer so that
// arrays or non-numeric input are never forwarded to the remote API.
$rawPage = $_GET['page'] ?? 1;
$page = is_scalar($rawPage) ? max(1, (int) $rawPage) : 1;

// Province-level area codes; the "area" request parameter is a 0-based index into this list.
$numbers = [
    110000, 120000, 130000, 140000, 150000,
    210000, 220000, 230000, 310000, 320000,
    330000, 340000, 350000, 360000, 370000,
    410000, 420000, 430000, 440000, 450000,
    460000, 500000, 510000, 520000, 530000,
    540000, 610000, 620000, 630000, 640000,
    650000, 660000,
];

// Read and validate the area parameter. Only an integer index is accepted;
// an array value (?area[]=1) would otherwise raise a TypeError inside isset() on PHP 8.
$area = $_GET['area'] ?? '';
$areaIndex = is_scalar($area) ? filter_var($area, FILTER_VALIDATE_INT) : false;
$areaid = ($areaIndex !== false && isset($numbers[$areaIndex])) ? $numbers[$areaIndex] : '';
if ($areaid === '') {
    echo json_encode(['error' => '无效的地区参数'], JSON_UNESCAPED_UNICODE);
    exit();
}
/**
 * Fetch the codes of the areas listed beneath a parent area.
 *
 * Sends a form-encoded POST to the getChildArea endpoint and collects the
 * non-empty `areaCode` of every entry in the response's `data` list.
 * Transport errors, HTTP error statuses and invalid JSON are written to the
 * error log; in those cases, and whenever the payload does not have the
 * expected shape or `retCode` is not '000000', an empty array is returned.
 *
 * @param int|string $areaid Parent area code.
 * @return array Child area codes; empty on failure or when none are reported.
 */
function getareaid($areaid) {
    $endpoint = 'https://xwpx.eduyun.cn/xspxRegister/getChildArea';

    // Form-encoded request body.
    $postData = http_build_query([
        'PAGE_SERIAL_VERSION_UID' => '',
        'areaCode' => $areaid,
    ]);

    $ch = curl_init();
    if ($ch === false) {
        error_log('cURL错误: 初始化失败');
        return [];
    }

    curl_setopt_array($ch, [
        CURLOPT_URL => $endpoint,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $postData,
        CURLOPT_RETURNTRANSFER => true,
        // Bound the request so a stalled upstream cannot hang this script indefinitely.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTPHEADER => [
            'Referer: https://xwpx.eduyun.cn/tol/toHomePageParentServices',
            'User-Agent: mozilla/5.0 (macintosh; intel mac os x 10_15_1) applewebkit/537.36 (khtml, like gecko) brave chrome/78.0.3904.70 safari/537.36 Edg/139.0.0.0',
            'Content-Type: application/x-www-form-urlencoded',
        ],
        // NOTE(review): hard-coded session cookie captured from a browser; it will presumably expire — confirm how it should be refreshed.
        CURLOPT_COOKIE => 'HWWAFSESID=d82bf97755bdb320d6; HWWAFSESTIME=1755569214521; SESSION=552af9b3-ac7a-4bba-b35a-27978f142f24; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk4YzAxNDY3MTE5YWItMDkxNmMxNmMxNmMxNmMtNGM2NTdiNTgtMjA3MzYwMC0xOThjMDE0NjcxMjExM2QifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%7D; Hm_lvt_c3f009f814f701e8fad8a17f9682ec79=1755596687; HMACCOUNT=497681945067787A; Hm_lpvt_c3f009f814f701e8fad8a17f9682ec79=1755596697',
        // Verify the server certificate and host name (previously disabled).
        // Requires a CA bundle to be available to PHP's cURL (e.g. curl.cainfo in php.ini).
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);

    $response = curl_exec($ch);
    if ($response === false || curl_errno($ch)) {
        error_log('cURL错误: ' . curl_error($ch));
        curl_close($ch);
        return [];
    }
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // An HTTP error status means the body is not a usable result.
    if ($status >= 400) {
        error_log('HTTP状态码异常: ' . $status);
        return [];
    }

    $result = json_decode($response, true);
    if (json_last_error() !== JSON_ERROR_NONE) {
        error_log('JSON解析错误: ' . json_last_error_msg());
        return [];
    }

    // Valid JSON can still be null, a scalar, or an object without the expected keys;
    // check the shape before indexing into it.
    if (
        !is_array($result)
        || ($result['retCode'] ?? null) !== '000000'
        || !is_array($result['data'] ?? null)
    ) {
        return [];
    }

    // Keep only entries that carry a non-empty area code.
    $areaCodes = [];
    foreach ($result['data'] as $item) {
        if (is_array($item) && !empty($item['areaCode'])) {
            $areaCodes[] = $item['areaCode'];
        }
    }
    return $areaCodes;
}
// Resolve the child areas of the selected province.
$childAreaCodes = getareaid($areaid);
// Without any child area there is nothing to query, so report it and stop.
if (empty($childAreaCodes)) {
    echo json_encode(['error' => '未获取到子地区编码'], JSON_UNESCAPED_UNICODE);
    exit();
}

// "object" filter: the value 5 is shorthand for the list "0,1,2,3,4".
// Non-scalar input (e.g. ?object[]=1) is treated as "not provided".
$object = $_GET['object'] ?? '';
if (!is_scalar($object)) {
    $object = '';
} elseif (trim((string) $object) === '5') {
    $object = "0,1,2,3,4";
}

// "type" filter: forwarded as profitType. It used to be read but never sent;
// when it is absent the previous hard-coded value '1' is kept.
$profitType = $_GET['type'] ?? '';
if (!is_scalar($profitType) || trim((string) $profitType) === '') {
    $profitType = '1';
}

// Index of the child area to query in this call; defaults to 0.
$currentIndex = isset($_GET['index']) ? intval($_GET['index']) : 0;
$totalCities = count($childAreaCodes);
// An out-of-range index falls back to the first child area.
if ($currentIndex < 0 || $currentIndex >= $totalCities) {
    $currentIndex = 0;
}
$currentCityCode = $childAreaCodes[$currentIndex];
// Index the caller should request next; wraps around to 0 after the last child area.
$nextIndex = ($currentIndex + 1) % $totalCities;

// Request body for the list endpoint; only the current child area is queried.
$postDataString = http_build_query([
    'PAGE_SERIAL_VERSION_UID' => '',
    'province' => $areaid,
    'city' => $currentCityCode,
    'area' => '',
    'object' => $object,
    'profitType' => $profitType,
    'businessType' => '1',
    'pageNo' => $page,
    'pageSize' => 10,
]);

$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL => $url,
    CURLOPT_POST => true,
    CURLOPT_POSTFIELDS => $postDataString,
    CURLOPT_RETURNTRANSFER => true,
    // Bound the request so a stalled upstream cannot hang this script indefinitely.
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTPHEADER => [
        'Content-Type: application/x-www-form-urlencoded',
        'Referer: https://xwpx.eduyun.cn/tolSpInfo/index',
        'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    ],
    // NOTE(review): hard-coded session cookie captured from a browser; it will presumably expire — confirm how it should be refreshed.
    CURLOPT_COOKIE => 'HWWAFSESID=d82bf97755bdb320d6; HWWAFSESTIME=1755569214521; SESSION=552af9b3-ac7a-4bba-b35a-27978f142f24; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk4YzAxNDY3MTE5YWItMDkxNmMxNmMxNmMxNmMtNGM2NTdiNTgtMjA3MzYwMC0xOThjMDE0NjcxMjExM2QifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22198c01467119ab-0916c16c16c16c-4c657b58-2073600-198c0146712113d%22%7D',
    // Verify the server certificate and host name (previously disabled).
    // Requires a CA bundle to be available to PHP's cURL (e.g. curl.cainfo in php.ini).
    CURLOPT_SSL_VERIFYPEER => true,
    CURLOPT_SSL_VERIFYHOST => 2,
]);

$response = curl_exec($ch);
if ($response === false || curl_errno($ch)) {
    $result = ['error' => '请求错误: ' . curl_error($ch)];
} else {
    $responseData = json_decode($response, true);
    // Only a non-empty decoded array is a usable payload. A bare scalar is valid JSON
    // but cannot take the extra keys added below, so it is reported as unparseable.
    $result = (is_array($responseData) && $responseData !== [])
        ? $responseData
        : ['error' => '无法解析响应数据'];
    // Cursor information so the caller can iterate over the child areas call by call.
    $result['current_index'] = $currentIndex;
    $result['next_index'] = $nextIndex;
    $result['current_city_code'] = $currentCityCode;
    $result['total_cities'] = $totalCities;
}
curl_close($ch);

// Emit the result as pretty-printed JSON with unescaped Unicode.
echo json_encode($result, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
?>
这样就可以,执行的参数为:peixun.php?page=[地址参数]&area=[地址参数1]&object=[地址参数2]&type=[地址参数3]&index=[地址参数4]
其中:
地址参数:1-49
地址参数1:0-31(对应代码中 $numbers 数组的下标,从0开始,0 对应 110000)
地址参数2:1-5
地址参数3: 1-5
地址参数4:0-30(子地区下标,从0开始;超出实际子地区数量时会被重置为0)
千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。

浙公网安备 33010602011771号