HarmonyOS分布式爬虫实战:Actor模型架构解析
作为在鸿蒙分布式系统中摸爬滚打的开发者,曾用Actor模型构建过日均千万级请求的爬虫系统。本文分享从架构设计到容错优化的实战经验,帮你用Actor模型打造高效稳定的分布式爬虫。
一、核心架构:三角色Actor协同设计
1.1 爬虫节点Actor(负责网页抓取)
/// Crawler node actor: drains its URL queue, fetches each page, and forwards
/// parsed records to the result aggregator as `StoreData` messages.
actor CrawlerNode {
    private var taskQueue: [String] = []
    /// Per-URL failure counter so a permanently dead URL cannot spin forever.
    private var failureCounts: [String: Int] = [:]
    /// Maximum number of times a failed URL is re-queued before being dropped.
    private let maxRequeues = 3
    private let aggregator: ActorRef<ResultAggregator>

    init(aggregator: ActorRef<ResultAggregator>) {
        // BUG FIX: was `this.aggregator` — Swift uses `self`, not `this`.
        self.aggregator = aggregator
    }

    /// Enqueue a URL and immediately drain the queue.
    receiver func addTask(url: String) {
        taskQueue.append(url)
        processTasks()
    }

    /// Process queued URLs until the queue is empty.
    private func processTasks() {
        while !taskQueue.isEmpty {
            let url = taskQueue.removeFirst()
            if let content = fetchPage(url) {
                let data = parsePage(content)
                aggregator.send(StoreData(data))
            } else {
                // BUG FIX: the original unconditionally re-appended every
                // failed URL, so one permanently unreachable page made this
                // `while` loop busy-spin forever. Re-queue at most
                // `maxRequeues` times, then drop the task and log it.
                let failures = (failureCounts[url] ?? 0) + 1
                failureCounts[url] = failures
                if failures <= maxRequeues {
                    taskQueue.append(url)
                } else {
                    log("Dropping task after repeated failures: \(url)")
                }
            }
        }
    }

    /// Fetch one page with up to three attempts; returns nil on total failure.
    private func fetchPage(_ url: String) -> String? {
        // BUG FIX: `0..3` is not a Swift range; the half-open form is `0..<3`.
        for _ in 0..<3 {
            do {
                return Http.get(url).content
            } catch {
                sleep(1) // fixed 1-second pause between attempts
            }
        }
        return nil
    }
}
1.2 任务调度器Actor(负载均衡)
/// Central scheduler: keeps a registry of crawler nodes and hands each
/// pending URL to the currently least-loaded node.
actor TaskScheduler {
    private var nodes: [ActorRef<CrawlerNode>] = []
    private var taskQueue: [String] = []

    /// Register a crawler node and re-run dispatch for any queued work.
    receiver func register(node: ActorRef<CrawlerNode>) {
        nodes.append(node)
        dispatchTasks()
    }

    /// Accept a new URL and try to dispatch it immediately.
    receiver func addTask(url: String) {
        taskQueue.append(url)
        dispatchTasks()
    }

    /// Drain the queue, always picking the least-loaded registered node.
    private func dispatchTasks() {
        while !taskQueue.isEmpty {
            // BUG FIX: the original force-unwrapped `nodes.min(by:)!`, which
            // crashes when tasks arrive before any node has registered.
            // Leave tasks queued; `register` re-triggers dispatch later.
            guard let node = nodes.min(by: { $0.load < $1.load }) else {
                return
            }
            // BUG FIX: `node.send(addTask(taskQueue.removeFirst()))` invoked
            // the scheduler's OWN `addTask` receiver locally (re-enqueueing
            // the URL forever) instead of messaging the node. Wrap the task
            // in a message, matching the `StoreData` send style used by
            // CrawlerNode.
            node.send(AddTask(url: taskQueue.removeFirst()))
        }
    }
}
1.3 结果聚合器Actor(数据处理)
/// Aggregator actor: buffers parsed records and flushes them to the database
/// in batches.
actor ResultAggregator {
// NOTE(review): `dataStore` has no initializer in this snippet — presumably
// injected or defaulted elsewhere; confirm before use.
private var dataStore: DataStore
// NOTE(review): name violates lowerCamelCase, but crawler nodes send
// `StoreData` messages under this exact name, so it is kept unchanged.
receiver func StoreData(data: [String]) {
dataStore.save(data)
// Flush every 100 records to amortize database round-trips.
if dataStore.count % 100 == 0 {
flushToDB()
}
}
private func flushToDB() {
// Batch write to the database.
}
}
二、容错机制:断点续爬与异常处理
2.1 断点续爬实现
/// Persists the union of all nodes' pending task queues so a restarted
/// cluster can resume crawling from where it left off.
actor CheckpointManager {
private let db: Database
/// Snapshot every node's pending URLs under the "crawler_tasks" key.
// NOTE(review): this reads `node.taskQueue`, which CrawlerNode declares
// `private`, and does so synchronously across an actor boundary — likely
// needs a message/await round-trip instead. Confirm against the runtime.
func saveState(nodes: [ActorRef<CrawlerNode>]) {
var tasks = [String]()
for node in nodes {
tasks.append(contentsOf: node.taskQueue)
}
db.save("crawler_tasks", tasks)
}
/// Returns the previously saved task list, or an empty list if none exists.
func restoreState() -> [String] {
return db.load("crawler_tasks") ?? []
}
}
2.2 异常重试策略
extension CrawlerNode {
    /// Fetch `url`, retrying with true exponential backoff.
    /// - Parameters:
    ///   - url: page to fetch.
    ///   - retries: attempts remaining; 0 means give up.
    ///   - delay: seconds to wait after the next failure; doubles per retry
    ///     (new parameter with a default, so existing callers are unaffected).
    /// - Returns: page content, or nil once all retries are exhausted.
    private func fetchWithRetry(url: String, retries: Int = 3, delay: Int = 1) -> String? {
        if retries == 0 {
            log("Failed: \(url)")
            return nil
        }
        do {
            return Http.get(url, timeout: 5s).content
        } catch {
            // BUG FIX: the original slept `1s * retries`, which SHRINKS as
            // retries count down (3s, 2s, 1s) — the opposite of the
            // "exponential backoff" its comment claimed. Doubling `delay`
            // gives real backoff: 1s, 2s, 4s, ...
            sleep(delay)
            // BUG FIX: Swift requires argument labels here; the original
            // `fetchWithRetry(url, retries-1)` would not compile.
            return fetchWithRetry(url: url, retries: retries - 1, delay: delay * 2)
        }
    }
}
三、性能优化:从代码到架构
3.1 网络IO优化
- 连接池复用:
/// Actor-guarded wrapper around a shared connection pool: borrow/release
/// are serialized by actor isolation, so connections are reused safely.
actor HttpPool {
    private let pool: ConnectionPool

    /// Borrow a pooled connection; pair every call with `release(connection:)`.
    func getConnection() -> HttpConnection {
        pool.borrow()
    }

    /// Return a borrowed connection to the pool for reuse.
    func release(connection: HttpConnection) {
        pool.release(connection)
    }
}
- 并发控制:
/// Illustrative fragment: bounding the number of in-flight requests per node.
actor CrawlerNode {
private let semaphore = Semaphore(5) // cap concurrent requests at 5
private func fetchPage(_ url: String) -> String? {
semaphore.acquire()
// `defer` guarantees the permit is returned on every exit path.
defer { semaphore.release() }
// request handling...
}
}
3.2 可视化监控
graph TD
A[Prometheus] -->|抓取指标| B(CrawlerNode)
A -->|调度指标| C(TaskScheduler)
A -->|存储指标| D(ResultAggregator)
E[Grafana] --> A
监控关键指标:
- 节点负载(任务处理耗时)
- 网络请求成功率
- 数据存储吞吐量
四、实战部署:从单机到集群
4.1 分布式部署架构
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ 节点1 │ │ 节点2 │ │ 节点3 │
│ (Crawler) │ │ (Crawler) │ │ (Scheduler) │
└─────────────┘ └─────────────┘ └─────────────┘
↑ 消息总线 ↑ 消息总线 ↑
└────────────────┼────────────────┘
┌──────────────────────┐
│ 分布式消息中间件 │
└──────────────────────┘
4.2 集群扩展策略
- 动态扩缩容:
/// Scales the crawler cluster up or down.
actor ClusterManager {
    /// Spawn three additional crawler nodes attached to the shared aggregator.
    func scaleOut() {
        // BUG FIX: `0..3` is not a Swift range; the half-open form is `0..<3`.
        for _ in 0..<3 {
            // BUG FIX: CrawlerNode's init declares the `aggregator:` label,
            // so the unlabeled call would not compile.
            spawn(CrawlerNode(aggregator: aggregator))
        }
    }
    /// Shut down low-load nodes to shrink the cluster.
    func scaleIn() {
        // Choose low-load nodes to terminate.
    }
}
- 故障转移:
/// Failover fragment: drop a dead node and redistribute its work.
actor Scheduler {
// NOTE(review): `nodes` and `rebalanceTasks` are not declared in this
// fragment — presumably this belongs to the fuller TaskScheduler above.
// Also, Swift's Array has no `remove(_:)` by element; this would need
// `removeAll(where:)` or an index lookup. Confirm against the real type.
receiver func nodeFailed(node: ActorRef<CrawlerNode>) {
nodes.remove(node)
rebalanceTasks()
}
}
五、避坑指南:分布式爬虫的生死线
- 任务重复抓取:用布隆过滤器对URL去重,避免重复任务
- 网络拥塞:实现全局请求限速,按域名分桶控制
- 数据一致性:结果聚合器使用幂等存储,避免重复数据

浙公网安备 33010602011771号