MIT6.824-MapReduce
Lab1: MapReduce(个人纪录版)
(烂完了,深刻感觉到自己有多菜,各个方面)
最初的版本是全部自己写的,测试的时候会出现时而成功,时而失败的情况,重写两遍的我已经不想再看两遍代码了,后面让AI帮忙分析,然后一点点的修改,最后出问题的地方是在Call4Map(),忘记更新那个状态了。
早些时候做的设计重复且没有用(除了对MapReduce的理解问题,还有就是刚接触的时候感觉逻辑复杂,一度不想写,断断续续写了好久,同时对go也不是很熟悉),后面改了好几次才有git上第一次提交的版本,第一次编译的时候错误很多,改了很久。
总体来说,确实没那么复杂,懂了才敢这么说。下面就不对代码做过多分析了, 仅仅说一下自己的思路,方便后面的复习。
关于怎么通信的没有关注,似乎是http.Serve吧,server`函数很重要,后面看看。
Coorderinator
作为一个协调器,需要考虑到维护一些状态,主要就是下面几个功能:
- 分配Map任务
- 对已经完成的Map做标记
- 分配Reduce任务
- 对已经完成的Reduce做标记
剩下的都是完成的事情。其中需要注意的是并发的情况,共享的数据是不能同时访问的,go的锁机制非常舒服,几乎不用怎么考虑锁。
下面这几个变量很重要,注释有说明。这里最初的设计非常失败,这里都是后面修改的,不然看不了。
nReduce int //reduce的任务数量,最终的reduce至少要分配几次
//map
MapTaskNums map[string]int //每一个文件都只能对应一个TaskNum
mapFinNum int //已经完成的Map任务数量
Files map[string]int // 0: unFinished. 1:allocated. 2:completed.每个任务的状态
MapRime map[string]time.Time //任务开始的时间,超时重新分配
//reduce
reduceFin []int // 0: unFinished. 1:allocated. 2:completed.每个任务的状态
reduceFinNum int //已经完成的Reduce任务数量
ReduceRime []time.Time //任务开始的时间,超时重新分配
下面是完整代码,关于时间的我直接AI辅助了,之前没有接触过,AI写例子,我来抄
package mr
import (
"log"
"net"
"net/http"
"net/rpc"
"os"
"sync"
"time"
)
type Coordinator struct {
// Your definitions here.
mu sync.Mutex
nReduce int
//map
MapTaskNums map[string]int // for fileName
mapFinNum int
Files map[string]int // 0: unFinished. 1:allocated. 2:completed.
MapRime map[string]time.Time
//reduce
reduceFin []int // 0: unFinished. 1:allocated. 2:completed.
reduceFinNum int
ReduceRime []time.Time
}
func (c *Coordinator) AllocateMapTask(args *MapArgs, reply *MapReply) error {
reply.FileName, reply.TaskNum = c.allocateMap()
reply.NReduce = c.nReduce
return nil
}
func (c *Coordinator) allocateMap() (string, int) {
c.mu.Lock()
defer c.mu.Unlock()
if c.mapFinNum == len(c.Files) {
return "", -1
}
for filename, allocated := range c.Files {
if allocated == 0 {
c.Files[filename] = 1
c.MapRime[filename] = time.Now()
return filename, c.MapTaskNums[filename]
}
if allocated == 1 && !c.MapRime[filename].IsZero() && time.Since(c.MapRime[filename]) > 10*time.Second {
c.MapRime[filename] = time.Now()
return filename, c.MapTaskNums[filename]
}
}
return "", -2
}
func (c *Coordinator) FinishedMap(args *MapFinArgs, reply *MapFinReply) error {
c.mu.Lock()
defer c.mu.Unlock()
if args.FileName != "" {
if c.Files[args.FileName] != 2 {
c.Files[args.FileName] = 2
c.mapFinNum++
}
if c.mapFinNum == len(c.Files) {
reply.AllFinished = true
}
}
return nil
}
func (c *Coordinator) AllocateReduceTask(args *ReduceArgs, reply *ReduceReply) error {
reply.FileNum = len(c.Files)
reply.ReduceNum = c.allocateReduce()
return nil
}
func (c *Coordinator) allocateReduce() int {
c.mu.Lock()
defer c.mu.Unlock()
// 如果 map 阶段还没完成,则不分配 reduce 任务
if c.mapFinNum != len(c.Files) {
return -1
}
if c.reduceFinNum == c.nReduce {
return -2
}
for idx, allocated := range c.reduceFin {
if allocated == 0 {
c.reduceFin[idx] = 1
c.ReduceRime[idx] = time.Now()
return idx
}
if allocated == 1 && !c.ReduceRime[idx].IsZero() && time.Since(c.ReduceRime[idx]) > 10*time.Second {
c.ReduceRime[idx] = time.Now()
return idx
}
}
return -1
}
func (c *Coordinator) FinishedReduce(args *ReduceFinArgs, reply *ReduceFinReply) error {
c.mu.Lock()
defer c.mu.Unlock()
if c.reduceFin[args.TaskNum] != 2 {
c.reduceFin[args.TaskNum] = 2
c.reduceFinNum++
}
if c.reduceFinNum == c.nReduce {
reply.AllFinished = true
}
return nil
}
// start a thread that listens for RPCs from worker.go
func (c *Coordinator) server() {
rpc.Register(c)
rpc.HandleHTTP()
//l, e := net.Listen("tcp", ":1234")
sockname := coordinatorSock()
os.Remove(sockname)
l, e := net.Listen("unix", sockname)
if e != nil {
log.Fatal("listen error:", e)
}
go http.Serve(l, nil)
}
// main/mrcoordinator.go calls Done() periodically to find out
// if the entire job has finished.
func (c *Coordinator) Done() bool {
c.mu.Lock()
defer c.mu.Unlock()
ret := false
if c.reduceFinNum == c.nReduce {
time.Sleep(5 * time.Second)
ret = true
}
// Your code here.
return ret
}
// create a Coordinator.
// main/mrcoordinator.go calls this function.
// nReduce is the number of reduce tasks to use.
func MakeCoordinator(files []string, nReduce int) *Coordinator {
c := Coordinator{}
c.nReduce = nReduce
c.Files = make(map[string]int)
c.MapTaskNums = make(map[string]int)
// Your code here.
for idx, filename := range files {
c.Files[filename] = 0
c.MapTaskNums[filename] = idx
}
c.reduceFin = make([]int, nReduce)
c.MapRime = make(map[string]time.Time, len(c.Files))
c.ReduceRime = make([]time.Time, nReduce)
c.server()
return &c
}
Worker
工作节点,执行Map、Reduce任务的,现在看来没有什么,最初没有考虑到退出的事情,没有进行无限循环(也是因为对go不熟,并且没有这方面的经验)。其他的还好,就是最初写的Call4Map最后被忽视,一直错误,并且一度觉得自己逻辑正确,怎么就不对呢?
package mr
import (
"encoding/json"
"fmt"
"hash/fnv"
"io/ioutil"
"log"
"net/rpc"
"os"
"sort"
"time"
)
// Map functions return a slice of KeyValue.
type KeyValue struct {
Key string
Value string
}
// for sorting by key.
type ByKey []KeyValue
// for sorting by key.
func (a ByKey) Len() int { return len(a) }
func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key }
// use ihash(key) % NReduce to choose the reduce
// task number for each KeyValue emitted by Map.
func ihash(key string) int {
h := fnv.New32a()
h.Write([]byte(key))
return int(h.Sum32() & 0x7fffffff)
}
func readfile(filename string) string {
file, err := os.Open(filename)
if err != nil {
log.Fatalf("cannot open %v", filename)
}
content, err := ioutil.ReadAll(file)
if err != nil {
log.Fatalf("cannot read %v", filename)
}
file.Close()
return string(content)
}
// main/mrworker.go calls this function.
func Worker(mapf func(string, string) []KeyValue,
reducef func(string, []string) string) {
// Your worker implementation here.
// uncomment to send the Example RPC to the coordinator.
// CallExample()
for {
filename, num4Task, nReduce := Call4Map()
if num4Task == -1 {
break
}
if filename == "" {
time.Sleep(time.Second)
continue
}
// read file content
content := readfile(filename)
// map file and save to file
kva := mapf(filename, content)
ofiles := make([]*os.File, nReduce)
encoders := make([]*json.Encoder, nReduce)
for i := 0; i < nReduce; i++ {
oname := fmt.Sprintf("mr-%d-%d", num4Task, i)
ofiles[i], _ = os.Create(oname)
encoders[i] = json.NewEncoder(ofiles[i])
}
for _, kv := range kva {
reduceIdx := ihash(kv.Key) % nReduce
if err := encoders[reduceIdx].Encode(&kv); err != nil {
fmt.Println("map write to file error!")
}
}
for i := 0; i < nReduce; i++ {
ofiles[i].Close()
}
// completed map task, tell coordinator
args := MapFinArgs{filename}
reply := MapFinReply{}
call("Coordinator.FinishedMap", &args, &reply)
}
for {
fileNum, reduceNum := Call4Reduce()
if reduceNum == -2 {
break
}
if reduceNum == -1 {
time.Sleep(time.Second)
continue
}
kva := []KeyValue{}
for i := 0; i < fileNum; i++ {
taskFile := fmt.Sprintf("mr-%d-%d", i, reduceNum)
file, err := os.Open(taskFile)
if err != nil {
continue
}
dec := json.NewDecoder(file)
for {
var kv KeyValue
if err := dec.Decode(&kv); err != nil {
break
}
kva = append(kva, kv)
}
file.Close()
}
sort.Sort(ByKey(kva))
oname := fmt.Sprintf("mr-out-%d", reduceNum)
ofile, _ := os.Create(oname)
i := 0
for i < len(kva) {
j := i + 1
for j < len(kva) && kva[j].Key == kva[i].Key {
j++
}
values := []string{}
for k := i; k < j; k++ {
values = append(values, kva[k].Value)
}
output := reducef(kva[i].Key, values)
fmt.Fprintf(ofile, "%v %v\n", kva[i].Key, output)
i = j
}
ofile.Close()
args := ReduceFinArgs{reduceNum}
reply := ReduceFinReply{}
call("Coordinator.FinishedReduce", &args, &reply)
}
}
func Call4Map() (string, int, int) {
args := MapArgs{0}
reply := MapReply{}
ok := call("Coordinator.AllocateMapTask", &args, &reply)
if !ok {
return "", -1, 0 //直接返回,退出map阶段
}
return reply.FileName, reply.TaskNum, reply.NReduce
}
func Call4Reduce() (int, int) {
args := ReduceArgs{}
reply := ReduceReply{}
ok := call("Coordinator.AllocateReduceTask", &args, &reply)
if !ok {
return 0, -2 //直接返回,退出reduce阶段
}
return reply.FileNum, reply.ReduceNum
}
// send an RPC request to the coordinator, wait for the response.
// usually returns true.
// returns false if something goes wrong.
func call(rpcname string, args interface{}, reply interface{}) bool {
// c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234")
sockname := coordinatorSock()
c, err := rpc.DialHTTP("unix", sockname)
if err != nil {
log.Fatal("dialing:", err)
}
defer c.Close()
err = c.Call(rpcname, args, reply)
if err == nil {
return true
}
fmt.Println(err)
return false
}
RPC
一个传输过程的传输必要数据的东西,首字母一定要大写,还是有多余的部分,懒得删了,也不知道怎么设计更好,但是必须带个参数,最后带一个AI设计的版本,更好、更优雅。但是还需要动其他的代码。设计这部分还是要好好学习,我的代码真是丑爆了。
package mr
//
// RPC definitions.
//
// remember to capitalize all names.
//
import (
"os"
"strconv"
)
// Add your RPC definitions here.
type MapArgs struct {
N int
}
type MapReply struct {
FileName string
TaskNum int // for filename, -1:结束, -2:忙碌等待,其他:任务号
NReduce int
}
type MapFinArgs struct {
FileName string
}
type MapFinReply struct {
AllFinished bool
}
type ReduceArgs struct {
FileName string
}
type ReduceReply struct {
FileNum int
// -1:map未结束,等待, -2:结束,其他:任务号
ReduceNum int // among of nReduce
}
type ReduceFinArgs struct {
TaskNum int
}
type ReduceFinReply struct {
AllFinished bool
}
// Cook up a unique-ish UNIX-domain socket name
// in /var/tmp, for the coordinator.
// Can't use the current directory since
// Athena AFS doesn't support UNIX-domain sockets.
func coordinatorSock() string {
s := "/var/tmp/5840-mr-"
s += strconv.Itoa(os.Getuid())
return s
}
AI
Worker
func Worker(mapf func(string, string) []KeyValue,
reducef func(string, []string) string) {
// ========== Map 阶段 ==========
for {
reply := requestMapTask()
switch reply.Status {
case TaskStatusAllDone:
// Map 阶段全部完成
break
case TaskStatusWait:
// 暂时没有任务,等待
time.Sleep(time.Second)
continue
case TaskStatusReady:
// 执行 Map 任务
executeMapTask(mapf, reply)
}
if reply.Status == TaskStatusAllDone {
break
}
}
// ========== Reduce 阶段 ==========
for {
reply := requestReduceTask()
switch reply.Status {
case TaskStatusAllDone:
// Reduce 阶段全部完成,Worker 退出
return
case TaskStatusWait:
// 暂时没有任务,等待
time.Sleep(time.Second)
continue
case TaskStatusReady:
// 执行 Reduce 任务
executeReduceTask(reducef, reply)
}
}
}
func requestMapTask() AllocateMapTaskReply {
args := AllocateMapTaskArgs{}
reply := AllocateMapTaskReply{}
ok := call("Coordinator.AllocateMapTask", &args, &reply)
if !ok {
// RPC 失败,Coordinator 可能已退出
reply.Status = TaskStatusAllDone
}
return reply
}
func executeMapTask(mapf func(string, string) []KeyValue, task AllocateMapTaskReply) {
// 读取文件
content := readfile(task.FileName)
kva := mapf(task.FileName, content)
// 写入中间文件
writeIntermediateFiles(kva, task.TaskNum, task.NReduce)
// 报告完成
args := ReportMapTaskArgs{
FileName: task.FileName,
TaskNum: task.TaskNum,
Version: task.Version,
}
reply := ReportMapTaskReply{}
call("Coordinator.ReportMapTask", &args, &reply)
}
func requestReduceTask() AllocateReduceTaskReply {
args := AllocateReduceTaskArgs{}
reply := AllocateReduceTaskReply{}
ok := call("Coordinator.AllocateReduceTask", &args, &reply)
if !ok {
// RPC 失败
reply.Status = TaskStatusAllDone
}
return reply
}
func executeReduceTask(reducef func(string, []string) string, task AllocateReduceTaskReply) {
// 读取中间文件
kva := readIntermediateFiles(task.ReduceNum, task.NMapTasks)
// 排序
sort.Sort(ByKey(kva))
// 写入输出文件
writeOutputFile(reducef, kva, task.ReduceNum)
// 报告完成
args := ReportReduceTaskArgs{
ReduceNum: task.ReduceNum,
Version: task.Version,
}
reply := ReportReduceTaskReply{}
call("Coordinator.ReportReduceTask", &args, &reply)
}
coordinator
type Coordinator struct {
mu sync.Mutex
// Map 任务相关
nMapTasks int
mapTasks map[string]*TaskInfo // filename -> task info
mapFinished int
// Reduce 任务相关
nReduce int
reduceTasks []*TaskInfo
reduceFinished int
}
type TaskInfo struct {
status int // 0: idle, 1: running, 2: finished
version int // 任务版本号
startTime time.Time // 任务开始时间
}
func (c *Coordinator) AllocateMapTask(args *AllocateMapTaskArgs, reply *AllocateMapTaskReply) error {
c.mu.Lock()
defer c.mu.Unlock()
// 所有任务已完成
if c.mapFinished == c.nMapTasks {
reply.Status = TaskStatusAllDone
return nil
}
// 寻找可分配的任务
for filename, task := range c.mapTasks {
if task.status == 0 || (task.status == 1 && time.Since(task.startTime) > 10*time.Second) {
// 分配任务
task.status = 1
task.version++
task.startTime = time.Now()
reply.Status = TaskStatusReady
reply.FileName = filename
reply.TaskNum = getTaskNum(filename) // 你的 MapTaskNums
reply.NReduce = c.nReduce
reply.Version = task.version
return nil
}
}
// 所有任务都在执行中
reply.Status = TaskStatusWait
return nil
}
func (c *Coordinator) ReportMapTask(args *ReportMapTaskArgs, reply *ReportMapTaskReply) error {
c.mu.Lock()
defer c.mu.Unlock()
task := c.mapTasks[args.FileName]
// 检查版本号,只接受最新版本
if task.version == args.Version && task.status != 2 {
task.status = 2
c.mapFinished++
reply.Accept = true
} else {
reply.Accept = false
}
return nil
}
// Reduce 任务类似...
rpc
package mr
// 任务状态枚举
type TaskStatus int
const (
TaskStatusReady TaskStatus = 0 // 任务可执行
TaskStatusWait TaskStatus = 1 // 暂时无任务,等待
TaskStatusAllDone TaskStatus = 2 // 所有任务完成
)
// ========== Map Task RPC ==========
type AllocateMapTaskArgs struct {
// Worker 请求任务时不需要参数
}
type AllocateMapTaskReply struct {
Status TaskStatus // 任务状态
FileName string // 要处理的文件名
TaskNum int // Map 任务编号
NReduce int // Reduce 任务总数
Version int // 任务版本号(防止重复执行)
}
type ReportMapTaskArgs struct {
FileName string // 完成的文件名
TaskNum int // 任务编号
Version int // 任务版本号
}
type ReportMapTaskReply struct {
Accept bool // Coordinator 是否接受这次完成报告
}
// ========== Reduce Task RPC ==========
type AllocateReduceTaskArgs struct {
// Worker 请求任务时不需要参数
}
type AllocateReduceTaskReply struct {
Status TaskStatus // 任务状态
ReduceNum int // Reduce 任务编号
NMapTasks int // Map 任务总数(用于读取中间文件)
Version int // 任务版本号
}
type ReportReduceTaskArgs struct {
ReduceNum int // 完成的 Reduce 任务编号
Version int // 任务版本号
}
type ReportReduceTaskReply struct {
Accept bool // Coordinator 是否接受这次完成报告
}
// ========== Example RPC (可删除) ==========
type ExampleArgs struct {
X int
}
type ExampleReply struct {
Y int
}

浙公网安备 33010602011771号