lab1

项目相关地址如下:

6.5840 Schedule: Spring 2025

nil.csail.mit.edu/6.5840/2025/notes/l01.txt // 这个Introduction很有用,相当于课程总结了

一、关键点

1 项目文件作用分析:

wc.go : 定义 Map 和 Reduce 逻辑, 通过 go build 命令加参数 -buildmode=plugin 编译为 .so 插件(动态库), 由 mrworker.go 加载后把 Map/Reduce 函数传给 worker.go 中的 Worker 使用
```
go build -buildmode=plugin ../mrapps/wc.go
```

rpc.go : 定义RPC传入和传出参数格式, 设置Unix套接字:

// coordinatorSock returns the path of the UNIX-domain socket used to reach the
// coordinator: "/var/tmp/5840-mr-" followed by the current user's numeric UID.
func coordinatorSock() string {
	s := "/var/tmp/5840-mr-"
	s += strconv.Itoa(os.Getuid())
	return s
}

worker.go : 请求任务, 处理任务, 汇报任务完成情况
coordinator.go : 负责初始化Task, 分配task, Task完成情况监视, 作为RPC的服务端提供方法
mrworker.go : 加载wc.so插件, 开启一个Worker
```
go run mrworker.go wc.so
```
mrcoordinator.go : 开启一个协调器,并等待任务全部完成时退出

2 关键点分析:

Worker 和 coordinator 的交互都通过RPC, 交互的结构为Task
整个项目采用一个coordinator对应多个Worker的形式
coordinator :
- 初始化任务
- 分配任务(先分配mapTask,再分配Reducetask)
- 监视任务完成情况
- 多个Worker都通过RPC来向coordinator 获取任务, 对共享数据要加锁

Worker

请求任务(既可以是maptask, 也可以是ReduceTask)
获取任务信息
开启一个 goroutine 去处理(处理的过程中, 比如写入多个文件、打开多个文件, 也可以用多个 goroutine)
要等一个Task结束, Worker才能去请求下个任务(可以用sync.WaitGroup)
map创建的中间文件格式: mr-%d(mapTaskId)-%d(bucketId) // bucketId := ihash(kv.Key) % nReduce

创建中间文件流程:

// 构造文件名
oname := fmt.Sprintf("mr-%d-%d", mapTaskId, bucketId)

// 使用临时文件避免部分写入问题
tmpFile, err := os.CreateTemp("", "mr-tmp-*")
if err != nil {
	errCh <- fmt.Errorf("cannot create temp file for bucket %d: %v", bucketId, err)
return
}
tmpName := tmpFile.Name()

// 创建 JSON 编码器
enc := json.NewEncoder(tmpFile)

// 写入内容到临时文件（JSON格式）
for _, kv := range kvs {
    if err := enc.Encode(&kv); err != nil {
        tmpFile.Close()
        os.Remove(tmpName)
        errCh <- fmt.Errorf("JSON encode failed for bucket %d: %v", bucketId, err)
        return
	}
}

// 手动刷盘确保数据写入
if err := tmpFile.Sync(); err != nil {
    tmpFile.Close()
    os.Remove(tmpName)
    errCh <- fmt.Errorf("sync failed for bucket %d: %v", bucketId, err)
    return
}

// 关闭文件
if err := tmpFile.Close(); err != nil {
    os.Remove(tmpName)
    errCh <- fmt.Errorf("close temp file failed for bucket %d: %v", bucketId, err)
    return
}

// 原子重命名为最终文件
if err := os.Rename(tmpName, oname); err != nil {
	errCh <- fmt.Errorf("rename temp file failed for bucket %d: %v", bucketId, err)
}

Reduce读取中间文件:

var kvs []KeyValue
	dec := json.NewDecoder(file)

	for {
		var kv KeyValue
		if err := dec.Decode(&kv); err != nil {
			if errors.Is(err, io.EOF) {
				break // 正常结束
			}
			return nil, fmt.Errorf("JSON 解码失败: %w", err)
		}
		kvs = append(kvs, kv)
	}

Reduce最终输出文件格式: mr-out-%d(reduceTaskId)

// 写入格式:
fmt.Fprintf(tempFile, "%v %v\n", intermediate[i].Key, output)

二、上代码

wc.go

package main

//
// a word-count application "plugin" for MapReduce.
//
// go build -buildmode=plugin wc.go
//

import (
	"strconv"
	"strings"
	"unicode"

	"6.5840/mr"
)

// Map is called once for each input file. filename is the name of the input
// file and is ignored; contents is the file's complete text. Map splits
// contents into words (maximal runs of Unicode letters) and returns one
// {word, "1"} pair per occurrence, in order of appearance.
func Map(filename string, contents string) []mr.KeyValue {
	// Every rune that is not a letter acts as a word separator.
	isSeparator := func(r rune) bool { return !unicode.IsLetter(r) }

	// FieldsFunc yields an empty slice when contents is empty or consists
	// solely of separators.
	words := strings.FieldsFunc(contents, isSeparator)

	// The result has exactly one entry per word, so size it up front.
	pairs := make([]mr.KeyValue, 0, len(words))
	for _, word := range words {
		pairs = append(pairs, mr.KeyValue{Key: word, Value: "1"})
	}
	return pairs
}

// Reduce is called once for each key generated by the map tasks, with the list
// of all values any map task created for that key. For word count the answer
// is simply how many values there are, rendered as a decimal string.
func Reduce(key string, values []string) string {
	occurrences := len(values)
	return strconv.Itoa(occurrences)
}

rpc.go

package mr

//
// RPC definitions.
//
// remember to capitalize all names.
//

import (
	"os"
	"strconv"
	"time"
)

//
// example to show how to declare the arguments
// and reply for an RPC.
//

// ExampleArgs is the argument type of the Coordinator.Example RPC.
type ExampleArgs struct {
	X int
}

// ExampleReply is the reply type of the Coordinator.Example RPC; the handler
// sets Y to X + 1.
type ExampleReply struct {
	Y int
}

// Add your RPC definitions here.
// TaskStatus describes where a task is in its lifecycle.
type TaskStatus int

const (
	// Undistribution: the task has not been handed to a worker (or was reset
	// after timing out). It is also the zero value of TaskStatus.
	Undistribution TaskStatus = iota
	// Completed: a worker reported the task as finished.
	Completed
	// Inprocess: the task was handed to a worker and is not yet reported done.
	Inprocess
)

// TaskType distinguishes map tasks from reduce tasks.
type TaskType int

const (
	// MapType marks a map task (also the zero value of TaskType).
	MapType TaskType = iota
	// ReduceType marks a reduce task.
	ReduceType
)

// RequestResult is the outcome code carried in RequestReply.
// NOTE(review): none of the code shown in this file assigns or reads it.
type RequestResult int

const (
	// RequestSuccess is the zero value.
	RequestSuccess RequestResult = iota
	// RrquestFail is spelled this way in the exported API ("Request" is
	// misspelled); renaming it would break any caller that uses the name.
	RrquestFail
)

// Task is the unit of work exchanged between coordinator and worker over RPC.
// The coordinator keeps one Task per map input file and one per reduce bucket.
type Task struct {
	// TaskId_ is the task's index in the coordinator's task array. For map
	// tasks it is the first number in "mr-<map>-<bucket>"; for reduce tasks
	// it is the bucket number the task consumes.
	TaskId_             int
	TaskStatus_         TaskStatus
	TaskType_           TaskType
	FileName_           string // map tasks only: input file to read
	NReduce_            int    // map tasks only: number of buckets to hash keys into
	// ExecutionTime_ is the time budget; a task running longer is treated as
	// timed out by both the coordinator's monitor and the worker.
	ExecutionTime_      time.Duration
	// TaskStartTime_ is set by the coordinator when the task is handed out.
	TaskStartTime_      time.Time
	// NoTask_ is set in a reply when every task is complete; the worker exits.
	NoTask_             bool
	// AllTaskisInprocess_ is set in a reply when nothing can be handed out
	// right now; the worker sleeps and asks again.
	AllTaskisInprocess_ bool
}
// RequestArgs is the (empty) argument of the Coordinator.Request RPC.
type RequestArgs struct {
}

// RequestReply is the reply of Coordinator.Request: the task to run, or a
// Task whose NoTask_ / AllTaskisInprocess_ flag tells the worker what to do.
type RequestReply struct {
	Task_          Task
	RequestResult_ RequestResult
}

// ReportArgs is the argument of Coordinator.Report: the task the worker
// finished, with TaskStatus_ set to Completed.
type ReportArgs struct {
	Task_ Task
}

// ReportReply is the (empty) reply of the Coordinator.Report RPC.
type ReportReply struct {
}

// coordinatorSock cooks up a unique-ish UNIX-domain socket name in /var/tmp
// for the coordinator. The current directory can't be used since Athena AFS
// doesn't support UNIX-domain sockets.
//
// A UNIX-domain socket is an inter-process communication mechanism addressed
// by a filesystem path rather than IP+port, so it only connects processes on
// the same host. The name is made per-user by appending the numeric UID.
func coordinatorSock() string {
	return "/var/tmp/5840-mr-" + strconv.Itoa(os.Getuid())
}

worker.go

package mr

import (
	"encoding/json"
	"errors"
	"fmt"
	"hash/fnv"
	"io"
	"log"
	"net/rpc"
	"os"
	"regexp"
	"sort"
	"strconv"
	"sync"
	"time"
)

// KeyValue is one key/value pair. Map functions return a slice of KeyValue.
type KeyValue struct {
	Key   string
	Value string
}

// ByKey adapts a []KeyValue to sort.Interface so it can be sorted by Key.
type ByKey []KeyValue

// Len reports the number of pairs.
func (a ByKey) Len() int {
	return len(a)
}

// Swap exchanges the pairs at positions i and j.
func (a ByKey) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}

// Less reports whether the key at position i sorts before the key at j.
func (a ByKey) Less(i, j int) bool {
	return a[j].Key > a[i].Key
}

// ihash maps a key to a non-negative int; use ihash(key) % NReduce to choose
// the reduce task number for each KeyValue emitted by Map.
//
// fnv.New32a returns a hash.Hash32 implementing 32-bit FNV-1a. hash.Hash32
// embeds hash.Hash, which in turn embeds io.Writer; per the hash.Hash contract
// Write feeds more data into the hash and never returns an error. Sum32
// returns the 32-bit digest.
func ihash(key string) int {
	hasher := fnv.New32a()
	hasher.Write([]byte(key))
	digest := hasher.Sum32()
	// Clear the top bit so the value is non-negative even where int is 32 bits.
	return int(digest & 0x7fffffff)
}

// mapWorker runs one map task.
//
// It reads the whole of filename, feeds it to mapf, partitions the resulting
// pairs into buckets with ihash(key) % nReduce, and writes every non-empty
// bucket to "mr-<mapTaskId>-<bucketId>" in the current directory as a stream
// of JSON objects. Buckets are written concurrently, one goroutine each.
//
// Any failure terminates the process via log.Fatalf (which calls os.Exit(1)
// after logging), so the task is never reported as complete.
func mapWorker(filename string, mapf func(string, string) []KeyValue, mapTaskId, nReduce int) {
	// ihash(...) % nReduce would panic on zero and yield useless buckets on a
	// negative count, so reject both up front with a readable message.
	if nReduce <= 0 {
		log.Fatalf("map task %d: invalid nReduce %d", mapTaskId, nReduce)
	}

	// os.ReadFile opens, reads to EOF and closes the file in one step.
	content, err := os.ReadFile(filename)
	if err != nil {
		log.Fatalf("cannot read %v: %v", filename, err)
	}

	kva := mapf(filename, string(content))

	// Group the pairs by bucket id.
	bucketMap := make(map[int][]KeyValue)
	for _, kv := range kva {
		bucketId := ihash(kv.Key) % nReduce
		bucketMap[bucketId] = append(bucketMap[bucketId], kv)
	}

	// One goroutine per bucket; errCh is buffered with one slot per goroutine
	// so a failing writer never blocks.
	var wg sync.WaitGroup
	errCh := make(chan error, len(bucketMap))
	for bucketId, kvs := range bucketMap {
		wg.Add(1)
		go func(bucketId int, kvs []KeyValue) {
			defer wg.Done()
			if err := writeBucketFile(mapTaskId, bucketId, kvs); err != nil {
				errCh <- err
			}
		}(bucketId, kvs)
	}

	wg.Wait()
	close(errCh)

	// Only non-nil errors are ever sent, so the first one received is fatal.
	for err := range errCh {
		log.Fatalf("map worker error: %v", err)
	}
}

// writeBucketFile writes kvs as JSON to "mr-<mapTaskId>-<bucketId>" in the
// current directory. The data goes to a temporary file first and is renamed
// into place only after a successful Sync and Close, so a reader never sees a
// partially written file. The temporary file is removed on every error path.
func writeBucketFile(mapTaskId, bucketId int, kvs []KeyValue) error {
	oname := fmt.Sprintf("mr-%d-%d", mapTaskId, bucketId)

	// Create the temp file in the destination directory, not the system temp
	// directory: os.Rename cannot move a file across filesystems.
	tmpFile, err := os.CreateTemp(".", "mr-tmp-*")
	if err != nil {
		return fmt.Errorf("cannot create temp file for bucket %d: %w", bucketId, err)
	}
	tmpName := tmpFile.Name()

	enc := json.NewEncoder(tmpFile)
	for i := range kvs {
		if err := enc.Encode(&kvs[i]); err != nil {
			tmpFile.Close()
			os.Remove(tmpName)
			return fmt.Errorf("JSON encode failed for bucket %d: %w", bucketId, err)
		}
	}

	// Flush to stable storage before the file becomes visible under its
	// final name.
	if err := tmpFile.Sync(); err != nil {
		tmpFile.Close()
		os.Remove(tmpName)
		return fmt.Errorf("sync failed for bucket %d: %w", bucketId, err)
	}

	if err := tmpFile.Close(); err != nil {
		os.Remove(tmpName)
		return fmt.Errorf("close temp file failed for bucket %d: %w", bucketId, err)
	}

	if err := os.Rename(tmpName, oname); err != nil {
		os.Remove(tmpName)
		return fmt.Errorf("rename temp file failed for bucket %d: %w", bucketId, err)
	}
	return nil
}

// reduceWorker runs one reduce task.
//
// reducef is the application's Reduce function: it receives a key and all the
// values collected for it and returns the combined result.
// reduceTaskId identifies the task and is also the bucket id it consumes.
//
// The function collects every file in the current directory named
// "mr-<map>-<reduceTaskId>", decodes them concurrently, sorts the pairs by
// key, calls reducef once per distinct key and writes one "<key> <result>"
// line per key to "mr-out-<reduceTaskId>". Output goes to a temporary file
// that is renamed into place only after it is fully written.
//
// Any failure terminates the process via log.Fatalf, so an incomplete result
// is never reported as a finished task.
func reduceWorker(reducef func(string, []string) string, reduceTaskId int) {
	// 1. The reduce task id doubles as the bucket id.
	targetBucket := reduceTaskId

	// 2. List the current directory (os.ReadDir returns entries sorted by
	// file name).
	entries, err := os.ReadDir(".")
	if err != nil {
		log.Fatalf("读取目录失败: %v", err)
	}

	// 3. Keep the intermediate files of this bucket. The pattern accepts
	// exactly "mr-<digits>-<digits>"; capture group 2 is the bucket id.
	// Names such as "mr-out-3" or "mr-tmp-123" do not match.
	pattern := regexp.MustCompile(`^mr-(\d+)-(\d+)$`)
	var intermediateFiles []string
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		match := pattern.FindStringSubmatch(entry.Name())
		if len(match) != 3 {
			continue
		}
		if bucketId, err := strconv.Atoi(match[2]); err == nil && bucketId == targetBucket {
			intermediateFiles = append(intermediateFiles, entry.Name())
		}
	}

	// 4. Decode all files concurrently. Each goroutine writes only its own
	// slot, so no locking is needed and the merge order is the file order.
	perFile := make([][]KeyValue, len(intermediateFiles))
	readErrs := make([]error, len(intermediateFiles))
	var wg sync.WaitGroup
	for idx, fname := range intermediateFiles {
		wg.Add(1)
		go func(idx int, fname string) {
			defer wg.Done()
			perFile[idx], readErrs[idx] = readJSONIntermediateFile(fname)
		}(idx, fname)
	}
	wg.Wait()

	// A file that cannot be read means missing input; producing output from
	// the remainder would silently give wrong results, so stop instead.
	var intermediate []KeyValue
	for idx, kvs := range perFile {
		if readErrs[idx] != nil {
			log.Fatalf("读取文件 %s 失败: %v", intermediateFiles[idx], readErrs[idx])
		}
		intermediate = append(intermediate, kvs...)
	}

	// 5. Sort by key so equal keys are adjacent. A stable sort keeps the
	// values of one key in file order, making the input to reducef
	// reproducible from run to run.
	sort.Stable(ByKey(intermediate))

	// 6. Final output name; the file holds plain "key value" lines.
	outputFilename := fmt.Sprintf("mr-out-%d", targetBucket)

	// Write to a temp file in the same directory so the rename is atomic.
	tempFile, err := os.CreateTemp(".", fmt.Sprintf("reduce-tmp-%d-*", reduceTaskId))
	if err != nil {
		log.Fatalf("无法创建临时文件: %v", err)
	}
	tempName := tempFile.Name()
	// discard closes and deletes the temp file. It is called explicitly
	// before every log.Fatalf because os.Exit skips deferred functions.
	discard := func() {
		tempFile.Close()
		os.Remove(tempName)
	}

	// 7. Walk the sorted pairs one run of equal keys at a time.
	for i := 0; i < len(intermediate); {
		j := i + 1
		for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key {
			j++
		}

		values := make([]string, 0, j-i)
		for k := i; k < j; k++ {
			values = append(values, intermediate[k].Value)
		}

		output := reducef(intermediate[i].Key, values)

		if _, err := fmt.Fprintf(tempFile, "%v %v\n", intermediate[i].Key, output); err != nil {
			discard()
			log.Fatalf("写入临时文件失败: %v", err)
		}
		i = j
	}

	// Make sure the data is on disk before it becomes visible.
	if err := tempFile.Sync(); err != nil {
		discard()
		log.Fatalf("同步磁盘失败: %v", err)
	}
	if err := tempFile.Close(); err != nil {
		os.Remove(tempName)
		log.Fatalf("关闭临时文件失败: %v", err)
	}

	// 8. Atomically publish the result under its final name.
	if err := os.Rename(tempName, outputFilename); err != nil {
		os.Remove(tempName)
		log.Fatalf("重命名文件失败: %v", err)
	}
}

// readJSONIntermediateFile decodes the stream of JSON-encoded KeyValue objects
// stored in filename and returns them in file order. It returns the os.Open
// error unchanged when the file cannot be opened, and a wrapped error when a
// record fails to decode; reaching io.EOF ends the stream normally.
func readJSONIntermediateFile(filename string) ([]KeyValue, error) {
	src, openErr := os.Open(filename)
	if openErr != nil {
		return nil, openErr
	}
	defer src.Close()

	decoder := json.NewDecoder(src)
	var pairs []KeyValue
	for {
		var pair KeyValue
		decodeErr := decoder.Decode(&pair)
		if decodeErr == nil {
			pairs = append(pairs, pair)
			continue
		}
		if errors.Is(decodeErr, io.EOF) {
			// End of stream: everything was read.
			return pairs, nil
		}
		return nil, fmt.Errorf("JSON 解码失败: %w", decodeErr)
	}
}

// Worker is the worker main loop; main/mrworker.go calls this function.
//
// mapf and reducef are the application's Map and Reduce functions. Worker
// repeatedly asks the coordinator for a task, runs it, and reports completion
// unless the task overran its time budget. It returns when the coordinator
// says there is no work left or when the RPC connection reports io.EOF.
func Worker(mapf func(string, string) []KeyValue, reducef func(string, []string) string) {
	for {
		// Ask for a task.
		args := RequestArgs{}
		reply := RequestReply{}
		if err := call("Coordinator.Request", &args, &reply); err != nil {
			if errors.Is(err, io.EOF) {
				// The coordinator closed the connection: the job is over.
				return
			}
			fmt.Printf("call failed!\n")
			// The reply carries nothing useful after a failed call. Pause
			// before retrying so a persistent failure does not spin the CPU.
			time.Sleep(time.Second)
			continue
		}

		task := reply.Task_
		if task.NoTask_ {
			// Every task is complete.
			return
		}
		if task.AllTaskisInprocess_ {
			// Everything is handed out; wait for a task to finish or time out.
			time.Sleep(5 * time.Second)
			continue
		}
		if task.TaskStatus_ != Inprocess {
			// The reply carries no task (for example another worker took the
			// last one first). Ask again after a short pause.
			time.Sleep(100 * time.Millisecond)
			continue
		}

		// Run the task. One task is processed at a time; the next request is
		// made only after this one has finished.
		if task.TaskType_ == MapType {
			mapWorker(task.FileName_, mapf, task.TaskId_, task.NReduce_)
		} else {
			reduceWorker(reducef, task.TaskId_)
		}

		// A task that overran its budget is not reported: the coordinator's
		// timeout monitor resets such tasks so they can be handed out again.
		if time.Since(task.TaskStartTime_) > task.ExecutionTime_ {
			continue
		}

		// Report completion, sending the same fields the coordinator filled
		// in plus the new status.
		report := ReportArgs{
			Task_: Task{
				TaskId_:        task.TaskId_,
				TaskType_:      task.TaskType_,
				TaskStatus_:    Completed,
				ExecutionTime_: task.ExecutionTime_,
				TaskStartTime_: task.TaskStartTime_,
			},
		}
		if err := call("Coordinator.Report", &report, &ReportReply{}); err != nil {
			fmt.Printf("call failed!\n")
		}
	}
}

// CallExample shows how to make an RPC call to the coordinator.
//
// The RPC argument and reply types are defined in rpc.go. The name
// "Coordinator.Example" tells the receiving server to invoke the Example()
// method of struct Coordinator.
func CallExample() {
	// Argument structure with its single field filled in.
	args := ExampleArgs{X: 99}

	// Reply structure for the server to fill.
	var reply ExampleReply

	// Send the request and wait for the reply.
	if err := call("Coordinator.Example", &args, &reply); err != nil {
		fmt.Printf("call failed!\n")
		return
	}

	// reply.Y should be 100.
	fmt.Printf("reply.Y %v\n", reply.Y)
}

// call sends an RPC request to the coordinator and waits for the response.
// It dials the coordinator's UNIX-domain socket anew for every request and
// closes the connection before returning.
//
// The result is nil on success, otherwise the error returned by the RPC
// client. Failing to dial is fatal: the process exits through log.Fatal.
func call(rpcname string, args interface{}, reply interface{}) error {
	// TCP alternative: rpc.DialHTTP("tcp", "127.0.0.1"+":1234")
	client, dialErr := rpc.DialHTTP("unix", coordinatorSock())
	if dialErr != nil {
		log.Fatal("dialing:", dialErr)
	}
	defer client.Close()

	return client.Call(rpcname, args, reply)
}

coordinator.go

package mr

import (
	"log"
	"net"
	"net/http"
	"net/rpc"
	"os"
	"sync"
	"time"
)

// MaptaskNum is set by Coordinator.Init to the number of input files.
var MaptaskNum int

const (
	// ExecutionTime is the time budget Init copies into every task.
	ExecutionTime = 10 * time.Second
	// ReducetaskNum is a fixed reduce-task count.
	// NOTE(review): MakeCoordinator also receives nReduce from its caller;
	// confirm that the two always agree.
	ReducetaskNum = 10
)

// Coordinator holds the map and reduce task tables and serves them to workers
// over RPC. Each table has its own lock; RPC handlers and the timeout monitor
// goroutines take the matching lock before touching a table.
type Coordinator struct {
	// Your definitions here.
	// MapTaskNum is the number of map tasks (one per input file).
	MapTaskNum           int
	// ReduceTaskNum is the number of reduce tasks (nReduce).
	ReduceTaskNum        int
	// MapTaskArrayLock_ guards MapTaskArray_ and the tasks it points to.
	MapTaskArrayLock_    sync.RWMutex
	// MapTaskArray_ is indexed by map task id.
	MapTaskArray_        []*Task
	// ReduceTaskArrayLock_ guards ReduceTaskArray_ and the tasks it points to.
	ReduceTaskArrayLock_ sync.RWMutex
	// ReduceTaskArray_ is indexed by reduce task id.
	ReduceTaskArray_     []*Task
}

// Your code here -- RPC handlers for the worker to call.

// Example is a sample RPC handler: it replies with the argument plus one.
//
// The RPC argument and reply types are defined in rpc.go.
func (c *Coordinator) Example(args *ExampleArgs, reply *ExampleReply) error {
	next := args.X + 1
	reply.Y = next
	return nil
}

// Init fills in the pre-allocated task tables: one map task per entry of
// files and one reduce task per slot of ReduceTaskArray_. Every task starts
// as Undistribution with the ExecutionTime budget. It also records the number
// of input files in the package-level MaptaskNum.
//
// files must have at least as many entries as MapTaskArray_; MakeCoordinator
// guarantees this by sizing the array from len(files).
func (c *Coordinator) Init(files []string) {
	MaptaskNum = len(files)

	for id, mapTask := range c.MapTaskArray_ {
		mapTask.TaskId_ = id
		mapTask.TaskStatus_ = Undistribution
		mapTask.TaskType_ = MapType
		mapTask.FileName_ = files[id]
		// Use the coordinator's own reduce count (the nReduce given to
		// MakeCoordinator), not a fixed constant: workers hash keys into
		// NReduce_ buckets and only buckets 0..ReduceTaskNum-1 are ever read
		// by a reduce task.
		mapTask.NReduce_ = c.ReduceTaskNum
		mapTask.ExecutionTime_ = ExecutionTime
		mapTask.NoTask_ = false
		mapTask.AllTaskisInprocess_ = false
	}

	for id, reduceTask := range c.ReduceTaskArray_ {
		reduceTask.TaskId_ = id
		reduceTask.TaskStatus_ = Undistribution
		reduceTask.TaskType_ = ReduceType
		reduceTask.ExecutionTime_ = ExecutionTime
		reduceTask.NoTask_ = false
		reduceTask.AllTaskisInprocess_ = false
	}

	log.Println("Init compeleted!!!")
}

// taskStatusToString returns the human-readable label used in log messages
// for status, or the "unknown" label for a value outside the three defined
// statuses.
func taskStatusToString(status TaskStatus) string {
	labels := map[TaskStatus]string{
		Undistribution: "未分配",
		Completed:      "已完成",
		Inprocess:      "处理中",
	}
	if label, known := labels[status]; known {
		return label
	}
	return "未知状态"
}

// Request is the RPC handler a worker calls to obtain work.
//
// Map tasks are handed out first; reduce tasks only once every map task is
// Completed. The reply carries one of:
//   - a task whose TaskStatus_ is Inprocess, which the worker must run;
//   - Task_.AllTaskisInprocess_ = true when nothing can be handed out right
//     now, so the worker should wait and ask again;
//   - Task_.NoTask_ = true when every map and reduce task is complete.
//
// It always returns nil.
func (c *Coordinator) Request(args *RequestArgs, reply *RequestReply) error {
	switch c.MapCompleteStatus() {
	case Undistribution:
		c.MapTaskArrayLock_.Lock()
		assigned := assignFirstUndistributed(c.MapTaskArray_, reply)
		c.MapTaskArrayLock_.Unlock()
		// The status was sampled before the lock was taken, so another
		// request may have claimed the last free task in between. Tell the
		// worker to wait rather than sending a reply with no instruction.
		if !assigned {
			reply.Task_.AllTaskisInprocess_ = true
		}
		return nil
	case Inprocess:
		// Every map task is handed out and at least one is still running.
		reply.Task_.AllTaskisInprocess_ = true
		return nil
	}

	// All map tasks are complete: move on to the reduce tasks.
	switch c.ReduceCompleteStatus() {
	case Undistribution:
		c.ReduceTaskArrayLock_.Lock()
		assigned := assignFirstUndistributed(c.ReduceTaskArray_, reply)
		c.ReduceTaskArrayLock_.Unlock()
		if !assigned {
			reply.Task_.AllTaskisInprocess_ = true
		}
	case Inprocess:
		// Every reduce task is handed out and at least one is still running.
		reply.Task_.AllTaskisInprocess_ = true
	default:
		// Everything is finished.
		reply.Task_.NoTask_ = true
	}
	return nil
}

// assignFirstUndistributed marks the first Undistribution task in tasks as
// Inprocess, stamps its start time, copies it into reply and reports true.
// It reports false, leaving reply untouched, when no such task exists.
// The caller must hold the lock that guards tasks.
func assignFirstUndistributed(tasks []*Task, reply *RequestReply) bool {
	for _, task := range tasks {
		if task.TaskStatus_ != Undistribution {
			continue
		}
		task.TaskStartTime_ = time.Now()
		task.TaskStatus_ = Inprocess
		task.NoTask_ = false
		task.AllTaskisInprocess_ = false
		reply.Task_ = *task
		return true
	}
	return false
}
// Report is the RPC handler a worker calls after finishing a task. When the
// reported status is Completed, the matching entry of the map or reduce table
// (chosen by TaskType_, indexed by TaskId_) is marked Completed. Reports with
// any other status are ignored. It always returns nil.
func (c *Coordinator) Report(args *ReportArgs, reply *ReportReply) error {
	if args.Task_.TaskStatus_ != Completed {
		return nil
	}

	// TaskId_ arrives over RPC and is used as a slice index, so it is range
	// checked first; an invalid id is logged and dropped instead of
	// panicking inside the handler.
	id := args.Task_.TaskId_

	if args.Task_.TaskType_ == MapType {
		c.MapTaskArrayLock_.Lock()
		defer c.MapTaskArrayLock_.Unlock()
		if id < 0 || id >= len(c.MapTaskArray_) {
			log.Printf("Report: ignoring map task id %d out of range [0,%d)", id, len(c.MapTaskArray_))
			return nil
		}
		c.MapTaskArray_[id].TaskStatus_ = Completed
		return nil
	}

	c.ReduceTaskArrayLock_.Lock()
	defer c.ReduceTaskArrayLock_.Unlock()
	if id < 0 || id >= len(c.ReduceTaskArray_) {
		log.Printf("Report: ignoring reduce task id %d out of range [0,%d)", id, len(c.ReduceTaskArray_))
		return nil
	}
	c.ReduceTaskArray_[id].TaskStatus_ = Completed
	return nil
}

// server registers the coordinator's RPC methods and starts a goroutine that
// serves RPCs from worker.go over HTTP on the coordinator's UNIX-domain
// socket. A failure to listen is fatal.
func (c *Coordinator) server() {
	rpc.Register(c)
	rpc.HandleHTTP()

	// TCP alternative: net.Listen("tcp", ":1234")
	socketPath := coordinatorSock()
	// Remove any socket file at this path before listening on it.
	os.Remove(socketPath)

	listener, listenErr := net.Listen("unix", socketPath)
	if listenErr != nil {
		log.Fatal("listen error:", listenErr)
	}
	go http.Serve(listener, nil)
}
// MapCompleteStatus summarises the map task table under a read lock:
//   - Completed when every map task is Completed (including an empty table);
//   - Inprocess when none is Undistribution but at least one is unfinished;
//   - Undistribution when at least one map task is still Undistribution.
func (c *Coordinator) MapCompleteStatus() TaskStatus {
	c.MapTaskArrayLock_.RLock()
	defer c.MapTaskArrayLock_.RUnlock()

	unfinished, unassigned := false, false
	for _, task := range c.MapTaskArray_ {
		switch task.TaskStatus_ {
		case Completed:
			// Nothing left to do for this task.
		case Undistribution:
			unfinished, unassigned = true, true
		default:
			unfinished = true
		}
	}

	switch {
	case !unfinished:
		return Completed
	case !unassigned:
		return Inprocess
	default:
		return Undistribution
	}
}

// ReduceCompleteStatus summarises the reduce task table under a read lock:
//   - Completed when every reduce task is Completed (including an empty table);
//   - Inprocess when none is Undistribution but at least one is unfinished;
//   - Undistribution when at least one reduce task is still Undistribution.
func (c *Coordinator) ReduceCompleteStatus() TaskStatus {
	c.ReduceTaskArrayLock_.RLock()
	defer c.ReduceTaskArrayLock_.RUnlock()

	unfinished, unassigned := false, false
	for _, task := range c.ReduceTaskArray_ {
		switch task.TaskStatus_ {
		case Completed:
			// Nothing left to do for this task.
		case Undistribution:
			unfinished, unassigned = true, true
		default:
			unfinished = true
		}
	}

	switch {
	case !unfinished:
		return Completed
	case !unassigned:
		return Inprocess
	default:
		return Undistribution
	}
}

// Done reports whether the entire job has finished, i.e. every map task and
// every reduce task is Completed. main/mrcoordinator.go calls it periodically.
func (c *Coordinator) Done() bool {
	// The reduce table is only inspected once the map phase is complete.
	return c.MapCompleteStatus() == Completed && c.ReduceCompleteStatus() == Completed
}

// startTimeoutMonitor launches two background goroutines that put timed-out
// tasks back into the Undistribution state so they can be handed out again.
// A task counts as timed out when it is Inprocess and more than its
// ExecutionTime_ has elapsed since TaskStartTime_. Both monitors poll every
// five seconds and exit on their own once there is nothing left to watch.
func (c *Coordinator) startTimeoutMonitor() {
	const pollInterval = 5 * time.Second

	// requeueExpired resets every timed-out task in tasks. The caller must
	// hold the lock guarding tasks.
	requeueExpired := func(tasks []*Task) {
		now := time.Now()
		for _, task := range tasks {
			if task.TaskStatus_ == Inprocess && now.Sub(task.TaskStartTime_) > task.ExecutionTime_ {
				task.TaskStatus_ = Undistribution
			}
		}
	}

	// Map monitor: runs until the map phase (or the whole job) is complete.
	go func() {
		for !c.Done() {
			if c.MapCompleteStatus() == Completed {
				return
			}

			c.MapTaskArrayLock_.Lock()
			requeueExpired(c.MapTaskArray_)
			c.MapTaskArrayLock_.Unlock()

			time.Sleep(pollInterval)
		}
	}()

	// Reduce monitor: idles until the map phase is complete, then watches
	// the reduce tasks until the whole job is done.
	go func() {
		if c.Done() {
			return
		}

		for c.MapCompleteStatus() != Completed {
			time.Sleep(pollInterval)
		}

		if c.Done() || c.MapCompleteStatus() != Completed {
			return
		}

		for !c.Done() {
			c.ReduceTaskArrayLock_.Lock()
			requeueExpired(c.ReduceTaskArray_)
			c.ReduceTaskArrayLock_.Unlock()

			time.Sleep(pollInterval)
		}
	}()
}

// MakeCoordinator creates a Coordinator; main/mrcoordinator.go calls this
// function. files are the map inputs (one map task each) and nReduce is the
// number of reduce tasks to use. Before returning it initialises the task
// tables, starts the timeout monitors and starts the RPC server.
func MakeCoordinator(files []string, nReduce int) *Coordinator {
	c := &Coordinator{
		MapTaskNum:    len(files),
		ReduceTaskNum: nReduce,
	}

	// make yields slices of nil pointers, so every slot needs its own Task
	// before Init can fill the tasks in.
	c.MapTaskArray_ = make([]*Task, c.MapTaskNum)
	for slot := range c.MapTaskArray_ {
		c.MapTaskArray_[slot] = new(Task)
	}

	c.ReduceTaskArray_ = make([]*Task, c.ReduceTaskNum)
	for slot := range c.ReduceTaskArray_ {
		c.ReduceTaskArray_[slot] = new(Task)
	}

	c.Init(files)
	c.startTimeoutMonitor()
	c.server()
	return c
}

mrworker.go

package main

//
// start a worker process, which is implemented
// in ../mr/worker.go. typically there will be
// multiple worker processes, talking to one coordinator.
//
// go run mrworker.go wc.so
//
// Please do not change this file.
//

import "6.5840/mr"
import "plugin"
import "os"
import "fmt"
import "log"

// main expects exactly one argument, the path of the application plugin
// (e.g. wc.so). It loads the plugin's Map and Reduce functions and runs the
// worker loop with them; with any other argument count it prints a usage
// line to stderr and exits with status 1.
func main() {
	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n")
		os.Exit(1)
	}

	mapf, reducef := loadPlugin(os.Args[1])

	mr.Worker(mapf, reducef)
}

// loadPlugin loads the application Map and Reduce functions
// from a plugin file, e.g. ../mrapps/wc.so
//
// It exits through log.Fatalf when the plugin cannot be opened or does not
// export a symbol named "Map" or "Reduce". The type assertions are unchecked,
// so a symbol with a different signature causes a panic.
func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) {
	p, err := plugin.Open(filename)
	if err != nil {
		log.Fatalf("cannot load plugin %v", filename)
	}
	xmapf, err := p.Lookup("Map")
	if err != nil {
		log.Fatalf("cannot find Map in %v", filename)
	}
	mapf := xmapf.(func(string, string) []mr.KeyValue)
	xreducef, err := p.Lookup("Reduce")
	if err != nil {
		log.Fatalf("cannot find Reduce in %v", filename)
	}
	reducef := xreducef.(func(string, []string) string)

	return mapf, reducef
}

mrcoordinator.go

package main

//
// start the coordinator process, which is implemented
// in ../mr/coordinator.go
//
// go run mrcoordinator.go pg*.txt
//
// Please do not change this file.
//

import "6.5840/mr"
import "time"
import "os"
import "fmt"

// main treats every command-line argument as an input file, creates a
// coordinator with 10 reduce tasks and polls Done once per second until the
// job has finished. With no input files it prints a usage line to stderr and
// exits with status 1.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n")
		os.Exit(1)
	}

	m := mr.MakeCoordinator(os.Args[1:], 10)
	for m.Done() == false {
		time.Sleep(time.Second)
	}

	// Stay alive for one more second after the job is done before exiting.
	time.Sleep(time.Second)
}

posted @ 2025-08-18 20:50 msnthh 阅读(2) 评论(0) 收藏举报

刷新页面返回顶部

msnthh