转载:OSD接收IO流程
转载链接:
https://www.cnblogs.com/yi-mu-xi/p/10282678.html
消息经 pipe->reader()(Pipe.cc)处理后,若 ms_can_fast_dispatch() 返回 true,则调用 fast_dispatch()(DispatchQueue.cc)直接分发;
否则通过 in_q->enqueue() 进入队列。
fast_dispatch()
---ms_fast_dispatch()【OSD.cc】
将message转化为OpRequestRef op,后续直接对这个op进行处理
1 // (zym) Fast-dispatch entry point: a handful of message types are handled right in the switch below; every other message is wrapped in an OpRequest and queued.
2 void OSD::ms_fast_dispatch(Message *m)
3 {
4 FUNCTRACE(cct);
5 // If the OSD service is stopping, drop our reference on the message and return (presumably the message is freed once its refcount reaches zero).
6 if (service.is_stopping()) {
7 m->put();
8 return;
9 }
10
11 // peering event?
12 switch (m->get_type()) {
13 case CEPH_MSG_PING:
14 dout(10) << "ping from " << m->get_source() << dendl;
15 m->put();
16 return;
17 case MSG_MON_COMMAND:
18 handle_command(static_cast<MMonCommand*>(m));
19 return;
20 case MSG_OSD_FORCE_RECOVERY:
21 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
22 return;
23 case MSG_OSD_SCRUB2:
24 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
25 return;
26
27 case MSG_OSD_PG_CREATE2:
28 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
29 case MSG_OSD_PG_QUERY:
30 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
31 case MSG_OSD_PG_NOTIFY:
32 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
33 case MSG_OSD_PG_INFO:
34 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
35 case MSG_OSD_PG_REMOVE:
36 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
37
38 // these are single-pg messages that handle themselves
39 case MSG_OSD_PG_LOG:
40 case MSG_OSD_PG_TRIM:
41 case MSG_OSD_BACKFILL_RESERVE:
42 case MSG_OSD_RECOVERY_RESERVE:
43 {
44 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
45 if (require_osd_peer(pm)) {
46 enqueue_peering_evt(
47 pm->get_spg(),
48 PGPeeringEventRef(pm->get_event()));
49 }
50 pm->put();
51 return;
52 }
53 }
54
55 // Wrap the Message in an OpRequest created by op_tracker; op is the smart pointer that refers to it.
56 // Author's note: OpRequestRef is a typedef of boost::intrusive_ptr<OpRequest> (declaration not shown here - confirm).
57 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);//OpTracker op_tracker;
58 {
59 #ifdef WITH_LTTNG
60 osd_reqid_t reqid = op->get_reqid();
61 #endif
62 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
63 reqid.name._num, reqid.tid, reqid.inc);
64 }// Author's open question: what does tracepoint do? NOTE(review): reqid only exists under WITH_LTTNG, so tracepoint presumably expands to nothing otherwise - confirm.
65
66 if (m->trace)
67 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
68
69 // note sender epoch, min req's epoch
70 // Read the map epochs carried by the message. (Author's open questions: what is an epoch? which Message subclasses reach this point?)
71 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
72 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
73 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
74
75 // Possibly delay dispatch (going by the helper's name; its body is not shown here).
76 service.maybe_inject_dispatch_delay();
77
78 // Queue directly when the connection has CEPH_FEATUREMASK_RESEND_ON_SPLIT or the message is not a CEPH_MSG_OSD_OP.
79 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
80 m->get_type() != CEPH_MSG_OSD_OP) {
81 // queue it directly
82 enqueue_op(
83 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
84 std::move(op),
85 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
86 } else {
87 // legacy client, and this is an MOSDOp (the *only* fast dispatch
88 // message that didn't have an explicit spg_t); we need to map
89 // them to an spg_t while preserving delivery order.
90 auto priv = m->get_connection()->get_priv();
91 if (auto session = static_cast<Session*>(priv.get()); session) {
92 std::lock_guard l{session->session_dispatch_lock};
93 op->get();// take an extra reference before linking op into waiting_on_map
94 session->waiting_on_map.push_back(*op);
95 OSDMapRef nextmap = service.get_nextmap_reserved();
96 dispatch_session_waiting(session, nextmap);
97 service.release_map(nextmap);// release the OSDMap obtained from get_nextmap_reserved() above
98 }
99 }
100 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
101 }
在enqueue_op()中将op加入OSDService->op_wq队列。该队列是由OSD->ShardedOpWQ队列初始化的,实际上是保存到了OSD->ShardedOpWQ,然后保存到ShardData中等待被处理,然后唤醒处理这个队列的线程,线程处理函数为OSD::ShardedOpWQ::_process()。
// OSD::enqueue_op: logs and traces the op, records how long it waited before
// being queued, and queues it on op_shardedwq wrapped in a PGOpItem/OpQueueItem.
//   pg    - PG the op is destined for
//   op    - the request; it is moved into the queue item
//   epoch - map epoch handed to the queue item
1 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
2 {
3 const utime_t stamp = op->get_req()->get_recv_stamp();
4 const utime_t latency = ceph_clock_now() - stamp;
5 const unsigned priority = op->get_req()->get_priority();
6 const int cost = op->get_req()->get_cost();
7 const uint64_t owner = op->get_req()->get_source().num();
8
9 dout(15) << "enqueue_op " << op << " prio " << priority
10 << " cost " << cost
11 << " latency " << latency
12 << " epoch " << epoch
13 << " " << *(op->get_req()) << dendl;
14 op->osd_trace.event("enqueue op");
15 op->osd_trace.keyval("priority", priority);
16 op->osd_trace.keyval("cost", cost);
17 op->mark_queued_for_pg();
18 logger->tinc(l_osd_op_before_queue_op_lat, latency);
19 op_shardedwq.queue( // author's note: this ends up in sdata->scheduler->enqueue(std::move(item))
20 OpQueueItem(
21 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
22 cost, priority, stamp, owner, epoch));
23 }
处理函数void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
// OSD::ShardedOpWQ::_process: worker-thread handler for the sharded op queue.
// NOTE: this listing is abridged by the article's author; the rows of dots mark
// omitted code, so sdata, slot and pg are set up in parts that are not shown.
1 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
2 {
3 ······
4 OpQueueItem item = sdata->pqueue->dequeue();// take the next item off the shard's queue
5
6 ·······
7
8 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
9 suicide_interval);
10
11 // take next item
12 auto qi = std::move(slot->to_process.front()); // author's note: to_process is a deque<OpQueueItem>; class OpQueueItem lives in the OSD's OpQueueItem.h (confirm)
13 ············
14 qi.run(osd, sdata, pg, tp_handle);// run the item; per the author this lands in PGOpItem::run for an op
15
16 ·········
17
18 }
// PGOpItem::run: hands the queued op to OSD::dequeue_op() and then unlocks the PG
// (the PG is presumably locked by the caller before run() is invoked - confirm).
// sdata is not used by this overload.
1 void PGOpItem::run(
2 OSD *osd,
3 OSDShard *sdata,
4 PGRef& pg,
5 ThreadPool::TPHandle &handle)
6 {
7 osd->dequeue_op(pg, op, handle);
8 pg->unlock();
9 }
10
// PGPeeringItem::run: forwards the stored peering event (evt) to
// OSD::dequeue_peering_evt() together with the shard and the raw PG pointer.
11 void PGPeeringItem::run(
12 OSD *osd,
13 OSDShard *sdata,
14 PGRef& pg,
15 ThreadPool::TPHandle &handle)
16 {
17 osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
18 }
19
20 void PGSnapTrim::run()
21
22 void PGScrub::run()
23
24 void PGRecovery::run()
25
26 void PGRecoveryContext::run()
27
28 void PGDelete::run()
osd->dequeue_op() 中调用pg->do_request() // 处理请求 PrimaryLogPG::do_request 【在PrimaryLogPG.cc中】
根据不同的消息类型对op进行处理
// PrimaryLogPG::do_request: per-PG entry point for a dequeued op. The op is parked
// on a waiting list when the PG cannot service it yet (map too old, backoff,
// not peered, flushes in progress, not active); otherwise pgbackend gets the
// first chance to handle it and, failing that, it is dispatched on message type.
//   op     - the request to process
//   handle - thread-pool handle, passed on to do_scan() and replica_scrub()
1 void PrimaryLogPG::do_request(
2 OpRequestRef& op,
3 ThreadPool::TPHandle &handle)
4 {
5 if (op->osd_trace) {
6 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
7 op->pg_trace.event("do request");
8 }
9 // make sure we have a new enough map
10 auto p = waiting_for_map.find(op->get_source());
11 if (p != waiting_for_map.end()) {
12 // preserve ordering
13 dout(20) << __func__ << " waiting_for_map "
14 << p->first << " not empty, queueing" << dendl;
15 p->second.push_back(op);
16 op->mark_delayed("waiting_for_map not empty");
17 return;
18 }
19 if (!have_same_or_newer_map(op->min_epoch)) {
20 dout(20) << __func__ << " min " << op->min_epoch
21 << ", queue on waiting_for_map " << op->get_source() << dendl;
22 waiting_for_map[op->get_source()].push_back(op);
23 op->mark_delayed("op must wait for map");
24 osd->request_osdmap_update(op->min_epoch); // ask the OSD for an osdmap update covering min_epoch
25 return;
26 }
27
28 if (can_discard_request(op)) {// discardable request: drop it without further processing
29 return;
30 }
31
32 // pg-wide backoffs
33 const Message *m = op->get_req();
34 int msg_type = m->get_type();
35 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
36 SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
37 if (!session)
38 return; // drop it.
39
40 if (msg_type == CEPH_MSG_OSD_OP) {
41 if (session->check_backoff(cct, info.pgid,
42 info.pgid.pgid.get_hobj_start(), m)) {
43 return;
44 }
45
46 bool backoff =
47 is_down() ||
48 is_incomplete() ||
49 (!is_active() && is_peered());
50 if (g_conf()->osd_backoff_on_peering && !backoff) {
51 if (is_peering()) {
52 backoff = true;
53 }
54 }
55 if (backoff) {
56 add_pg_backoff(session);// register a PG-wide backoff for this session; the op is not processed further here
57 return;
58 }
59 }
60 // pg backoff acks at pg-level
61 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
62 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
63 if (ba->begin != ba->end) {
64 handle_backoff(op); // backoff ack whose range is not a single object (begin != end)
65 return;
66 }
67 }
68 }
69
70 if (!is_peered()) {
71 // Delay unless PGBackend says it's ok
72 if (pgbackend->can_handle_while_inactive(op)) {
73 bool handled = pgbackend->handle_message(op);
74 ceph_assert(handled);
75 return;
76 } else {
77 waiting_for_peered.push_back(op);
78 op->mark_delayed("waiting for peered");
79 return;
80 }
81 }
82
83 if (flushes_in_progress > 0) {
84 dout(20) << flushes_in_progress
85 << " flushes_in_progress pending "
86 << "waiting for flush on " << op << dendl;
87 waiting_for_flush.push_back(op);// flushes are still in progress: park the op on waiting_for_flush
88 op->mark_delayed("waiting for flush");
89 return;
90 }
91
92 ceph_assert(is_peered() && flushes_in_progress == 0);
93 if (pgbackend->handle_message(op))
94 return;
95
96 switch (msg_type) {
97 case CEPH_MSG_OSD_OP:
98 case CEPH_MSG_OSD_BACKOFF:
99 if (!is_active()) {
100 dout(20) << " peered, not active, waiting for active on " << op << dendl;
101 waiting_for_active.push_back(op);
102 op->mark_delayed("waiting for active");
103 return;
104 }
105 switch (msg_type) {
106 case CEPH_MSG_OSD_OP:
107 // verify client features
108 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
109 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
110 osd->reply_op_error(op, -EOPNOTSUPP);
111 return;
112 }
113 do_op(op); // handle the client op: PrimaryLogPG::do_op(OpRequestRef& op) (in PrimaryLogPG.cc, per the author)
114 break;
115 case CEPH_MSG_OSD_BACKOFF:
116 // object-level backoff acks handled in osdop context
117 handle_backoff(op);
118 break;
119 }
120 break;
121
122 case MSG_OSD_PG_SCAN:
123 do_scan(op, handle);
124 break;
125
126 case MSG_OSD_PG_BACKFILL:
127 do_backfill(op);
128 break;
129
130 case MSG_OSD_PG_BACKFILL_REMOVE:
131 do_backfill_remove(op);
132 break;
133
134 case MSG_OSD_SCRUB_RESERVE:
135 {
136 const MOSDScrubReserve *m =
137 static_cast<const MOSDScrubReserve*>(op->get_req());
138 switch (m->type) {
139 case MOSDScrubReserve::REQUEST:
140 handle_scrub_reserve_request(op);
141 break;
142 case MOSDScrubReserve::GRANT:
143 handle_scrub_reserve_grant(op, m->from);
144 break;
145 case MOSDScrubReserve::REJECT:
146 handle_scrub_reserve_reject(op, m->from);
147 break;
148 case MOSDScrubReserve::RELEASE:
149 handle_scrub_reserve_release(op);
150 break;
151 }
152 }
153 break;
154
155 case MSG_OSD_REP_SCRUB:
156 replica_scrub(op, handle);
157 break;
158
159 case MSG_OSD_REP_SCRUBMAP:
160 do_replica_scrub_map(op);
161 break;
162
163 case MSG_OSD_PG_UPDATE_LOG_MISSING:
164 do_update_log_missing(op);
165 break;
166
167 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
168 do_update_log_missing_reply(op);
169 break;
170
171 default:
172 ceph_abort_msg("bad message type in do_request");
173 }
174 }
// Summary: validates a client MOSDOp (placement, backoff, flags, caps, name
// lengths, blacklist, full state, write size), parks it on a waiting list when
// the target object is not ready, resolves the object context, then builds an
// OpContext and runs it through execute_ctx().
1 /** do_op - do an op
2 * pg lock will be held (if multithreaded)
3 * osd_lock NOT held.
4 */
5 void PrimaryLogPG::do_op(OpRequestRef& op)
6 {
7 FUNCTRACE(cct);
8 // NOTE: take a non-const pointer here; we must be careful not to
9 // change anything that will break other reads on m (operator<<).
10 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
11 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
12 if (m->finish_decode()) { // author's open question: what exactly gets decoded here?
13 op->reset_desc(); // for TrackedOp
14 m->clear_payload();
15 }
16
17 dout(20) << __func__ << ": op " << *m << dendl;
18
19 hobject_t head = m->get_hobj();
20 head.snap = CEPH_NOSNAP;
21
22 if (!info.pgid.pgid.contains(
23 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {// head does not belong to this PG for the pool's current pg_num (author wonders: is the PG splitting?)
24 derr << __func__ << " " << info.pgid.pgid << " does not contain "
25 << head << " pg_num " << pool.info.get_pg_num() << " hash "
26 << std::hex << head.get_hash() << std::dec << dendl;
27 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
28 << " op " << *m;
29 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
30 return;
31 }
32
33 bool can_backoff =
34 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
35 SessionRef session;
36 if (can_backoff) {
37 session = static_cast<Session*>(m->get_connection()->get_priv().get());
38 if (!session.get()) {
39 dout(10) << __func__ << " no session" << dendl;
40 return;
41 }
42
43 if (session->check_backoff(cct, info.pgid, head, m)) {
44 return;
45 }
46 }
47
48 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
49 // not implemented.
50 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
51 osd->reply_op_error(op, -EINVAL); // PARALLELEXEC is rejected outright with -EINVAL
52 return;
53 }
54
55 if (op->rmw_flags == 0) {
56 int r = osd->osd->init_op_flags(op);
57 if (r) {
58 osd->reply_op_error(op, r);
59 return;
60 }
61 }
62
63 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
64 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
65 op->may_read() &&
66 !(op->may_write() || op->may_cache())) {// read-only op with BALANCE_READS/LOCALIZE_READS set: the primary or a replica may serve it
67 // balanced reads; any replica will do
68 if (!(is_primary() || is_replica())) {
69 osd->handle_misdirected_op(this, op);
70 return;
71 }
72 } else {
73 // normal case; must be primary
74 if (!is_primary()) {
75 osd->handle_misdirected_op(this, op);
76 return;
77 }
78 }
79
80 if (!op_has_sufficient_caps(op)) { // insufficient caps: reply -EPERM
81 osd->reply_op_error(op, -EPERM);
82 return;
83 }
84
85 if (op->includes_pg_op()) {// the request contains a PG-level op (author's note: flag CEPH_OSD_RMW_FLAG_PGOP - not visible here, confirm)
86 return do_pg_op(op);//void PrimaryLogPG::do_pg_op(OpRequestRef op)
87 }
88
89 // object name too long?
90 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {// object name longer than osd_max_object_name_len: reply -ENAMETOOLONG
91 dout(4) << "do_op name is longer than "
92 << cct->_conf->osd_max_object_name_len
93 << " bytes" << dendl;
94 osd->reply_op_error(op, -ENAMETOOLONG);
95 return;
96 }
97 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {// locator key longer than osd_max_object_name_len: reply -ENAMETOOLONG
98 dout(4) << "do_op locator is longer than "
99 << cct->_conf->osd_max_object_name_len
100 << " bytes" << dendl;
101 osd->reply_op_error(op, -ENAMETOOLONG);
102 return;
103 }
104 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {// namespace longer than osd_max_object_namespace_len: reply -ENAMETOOLONG
105 dout(4) << "do_op namespace is longer than "
106 << cct->_conf->osd_max_object_namespace_len
107 << " bytes" << dendl;
108 osd->reply_op_error(op, -ENAMETOOLONG);
109 return;
110 }
111
112 if (int r = osd->store->validate_hobject_key(head)) {// the backing store validates head; a non-zero result is sent back as the error
113 dout(4) << "do_op object " << head << " invalid for backing store: "
114 << r << dendl;
115 osd->reply_op_error(op, r);
116 return;
117 }
118
119 // blacklisted?
120 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {// is the sender's address blacklisted in the current OSDMap?
121 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
122 osd->reply_op_error(op, -EBLACKLISTED);
123 return;
124 }
125
126 // order this op as a write?
127 bool write_ordered = op->rwordered(); // result of op->rwordered(): whether the op is ordered as a write
128
129 // discard due to cluster full transition? (we discard any op that
130 // originates before the cluster or pool is marked full; the client
131 // will resend after the full flag is removed or if they expect the
132 // op to succeed despite being full). The except is FULL_FORCE and
133 // FULL_TRY ops, which there is no reason to discard because they
134 // bypass all full checks anyway. If this op isn't write or
135 // read-ordered, we skip.
136 // FIXME: we exclude mds writes for now.
137 if (write_ordered && !(m->get_source().is_mds() ||
138 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
139 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
140 info.history.last_epoch_marked_full > m->get_map_epoch()) {
141 dout(10) << __func__ << " discarding op sent before full " << m << " "
142 << *m << dendl;
143 return;
144 }
145 // mds should have stopped writing before this point.
146 // We can't allow OSD to become non-startable even if mds
147 // could be writing as part of file removals.
148 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
149 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
150 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
151 return;
152 }
153 int64_t poolid = get_pgid().pool();
154 if (op->may_write()) { // the op may write
155
156 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);// look up this PG's pool; if it is missing the op is dropped with no reply (author's question: really no reply to the client?)
157 if (!pi) {
158 return;
159 }
160
161 // invalid?
162 if (m->get_snapid() != CEPH_NOSNAP) {
163 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
164 osd->reply_op_error(op, -EINVAL);
165 return;
166 }
167
168 // too big?
169 if (cct->_conf->osd_max_write_size &&
170 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {// write payload exceeds osd_max_write_size << 20
171 // journal can't hold commit!
172 derr << "do_op msg data len " << m->get_data_len()
173 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
174 << " on " << *m << dendl;
175 osd->reply_op_error(op, -OSD_WRITETOOBIG);
176 return;
177 }
178 }
179
180 dout(10) << "do_op " << *m
181 << (op->may_write() ? " may_write" : "")
182 << (op->may_read() ? " may_read" : "")
183 << (op->may_cache() ? " may_cache" : "")
184 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
185 << " flags " << ceph_osd_flag_string(m->get_flags())
186 << dendl;
187
188 // missing object?
189 if (is_unreadable_object(head)) {// head is currently unreadable
190 if (!is_primary()) {// not the primary: reply -EAGAIN
191 osd->reply_op_error(op, -EAGAIN);
192 return;
193 }
194 if (can_backoff &&
195 (g_conf()->osd_backoff_on_degraded ||
196 (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
197 add_backoff(session, head, head);
198 maybe_kick_recovery(head);// backoff was added above; now try to kick recovery of head
199 } else {
200 wait_for_unreadable_object(head, op);
201 }
202 return;
203 }
204
205 if (write_ordered) {
206 // degraded object?
207 if (is_degraded_or_backfilling_object(head)) {
208 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
209 add_backoff(session, head, head);
210 maybe_kick_recovery(head);
211 } else {
212 wait_for_degraded_object(head, op);
213 }
214 return;
215 }
216
217 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
218 dout(20) << __func__ << ": waiting for scrub" << dendl;
219 waiting_for_scrub.push_back(op);
220 op->mark_delayed("waiting for scrub");
221 return;
222 }
223
224 // blocked on snap?
225 // head is listed in objects_blocked_on_degraded_snap: wait on the degraded snap it maps to
226 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
227 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
228 hobject_t to_wait_on(head);
229 to_wait_on.snap = blocked_iter->second;
230 wait_for_degraded_object(to_wait_on, op);
231 return;
232 }
233
234 // head is listed in objects_blocked_on_snap_promotion: wait for the object that blocks it
235 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
236 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
237 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
238 return;
239 }
240
241 // head is listed in objects_blocked_on_cache_full: hand the op to block_write_on_full_cache()
242 if (objects_blocked_on_cache_full.count(head)) {
243 block_write_on_full_cache(head, op);
244 return;
245 }
246 }
247
248 // dup/resent?
249 if (op->may_write() || op->may_cache()) {
250 // warning: we will get back *a* request for this reqid, but not
251 // necessarily the most recent. this happens with flush and
252 // promote ops, but we can't possible have both in our log where
253 // the original request is still not stable on disk, so for our
254 // purposes here it doesn't matter which one we get.
255 eversion_t version;
256 version_t user_version;
257 int return_code = 0;
258 bool got = check_in_progress_op(
259 m->get_reqid(), &version, &user_version, &return_code);
260 if (got) {
261 dout(3) << __func__ << " dup " << m->get_reqid()
262 << " version " << version << dendl;
263 if (already_complete(version)) {
264 osd->reply_op_error(op, return_code, version, user_version);
265 } else {
266 dout(10) << " waiting for " << version << " to commit" << dendl;
267 // always queue ondisk waiters, so that we can requeue if needed
268 waiting_for_ondisk[version].emplace_back(op, user_version, return_code);
269 op->mark_delayed("waiting for ondisk");
270 }
271 return;
272 }
273 }
274
275 ObjectContextRef obc;
276 bool can_create = op->may_write();
277 hobject_t missing_oid;
278
279 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
280 hobject_t _oid_head;
281 if (m->get_snapid() == CEPH_SNAPDIR) {
282 _oid_head = m->get_hobj().get_head();
283 }
284 const hobject_t& oid =
285 m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj();
286
287 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
288 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
289 OSDOp& osd_op = *p;
290
291 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
292 if (m->get_snapid() != CEPH_SNAPDIR) {
293 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
294 osd->reply_op_error(op, -EINVAL);
295 return;
296 }
297 } else {
298 if (m->get_snapid() == CEPH_SNAPDIR) {
299 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
300 osd->reply_op_error(op, -EINVAL);
301 return;
302 }
303 }
304 }
305
306 // io blocked on obc?
307 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
308 maybe_await_blocked_head(oid, op)) {
309 return;
310 }
311
312 int r = find_object_context(
313 oid, &obc, can_create,
314 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
315 &missing_oid);
316
317 // LIST_SNAPS needs the ssc too
318 if (obc &&
319 m->get_snapid() == CEPH_SNAPDIR &&
320 !obc->ssc) {
321 obc->ssc = get_snapset_context(oid, true);
322 }
323
324 if (r == -EAGAIN) {
325 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
326 // we have to wait for the object.
327 if (is_primary()) {
328 // missing the specific snap we need; requeue and wait.
329 ceph_assert(!op->may_write()); // only happens on a read/cache
330 wait_for_unreadable_object(missing_oid, op);
331 return;
332 }
333 } else if (r == 0) {
334 if (is_unreadable_object(obc->obs.oi.soid)) {
335 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
336 << " is unreadable, waiting" << dendl;
337 wait_for_unreadable_object(obc->obs.oi.soid, op);
338 return;
339 }
340
341 // degraded object? (the check above was for head; this could be a clone)
342 if (write_ordered &&
343 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
344 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
345 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
346 << " is degraded, waiting" << dendl;
347 wait_for_degraded_object(obc->obs.oi.soid, op);
348 return;
349 }
350 }
351
352 bool in_hit_set = false;
353 if (hit_set) {
354 if (obc.get()) {
355 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
356 in_hit_set = true;
357 } else {
358 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
359 in_hit_set = true;
360 }
361 if (!op->hitset_inserted) {
362 hit_set->insert(oid);
363 op->hitset_inserted = true;
364 if (hit_set->is_full() ||
365 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
366 hit_set_persist();
367 }
368 }
369 }
370
371 if (agent_state) {
372 if (agent_choose_mode(false, op))
373 return;
374 }
375
376 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
377 if (maybe_handle_manifest(op,
378 write_ordered,
379 obc))
380 return;
381 }
382
383 if (maybe_handle_cache(op,
384 write_ordered,
385 obc,
386 r,
387 missing_oid,
388 false,
389 in_hit_set))
390 return;
391
392 if (r && (r != -ENOENT || !obc)) {
393 // copy the reqids for copy get on ENOENT
394 if (r == -ENOENT &&
395 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
396 fill_in_copy_get_noent(op, oid, m->ops[0]);
397 return;
398 }
399 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
400 if (op->may_write() &&
401 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
402 record_write_error(op, oid, nullptr, r);
403 } else {
404 osd->reply_op_error(op, r);
405 }
406 return;
407 }
408
409 // make sure locator is consistent
410 object_locator_t oloc(obc->obs.oi.soid);
411 if (m->get_object_locator() != oloc) {
412 dout(10) << " provided locator " << m->get_object_locator()
413 << " != object's " << obc->obs.oi.soid << dendl;
414 osd->clog->warn() << "bad locator " << m->get_object_locator()
415 << " on object " << oloc
416 << " op " << *m;
417 }
418
419 // io blocked on obc?
420 if (obc->is_blocked() &&
421 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
422 wait_for_blocked_object(obc->obs.oi.soid, op);
423 return;
424 }
425
426 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
427
428 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);// build the OpContext that carries this message's ops (&m->ops); author's note: those ops are the per-object requests a client (e.g. rbd) splits its I/O into
429
430 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
431 dout(20) << __func__ << ": skipping rw locks" << dendl;
432 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
433 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
434
435 // verify there is in fact a flush in progress
436 // FIXME: we could make this a stronger test.
437 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
438 if (p == flush_ops.end()) {
439 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
440 reply_ctx(ctx, -EINVAL);
441 return;
442 }
443 } else if (!get_rw_locks(write_ordered, ctx)) {
444 dout(20) << __func__ << " waiting for rw locks " << dendl;
445 op->mark_delayed("waiting for rw locks");
446 close_op_ctx(ctx);
447 return;
448 }
449 dout(20) << __func__ << " obc " << *obc << dendl;
450
451 if (r) {
452 dout(20) << __func__ << " returned an error: " << r << dendl;
453 close_op_ctx(ctx);
454 if (op->may_write() &&
455 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
456 record_write_error(op, oid, nullptr, r);
457 } else {
458 osd->reply_op_error(op, r);
459 }
460 return;
461 }
462
463 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
464 ctx->ignore_cache = true;
465 }
466
467 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
468 // This object is lost. Reading from it returns an error.
469 dout(20) << __func__ << ": object " << obc->obs.oi.soid
470 << " is lost" << dendl;
471 reply_ctx(ctx, -ENFILE);
472 return;
473 }
474 if (!op->may_write() &&
475 !op->may_cache() &&
476 (!obc->obs.exists ||
477 ((m->get_snapid() != CEPH_SNAPDIR) &&
478 obc->obs.oi.is_whiteout()))) {
479 // copy the reqids for copy get on ENOENT
480 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
481 fill_in_copy_get_noent(op, oid, m->ops[0]);
482 close_op_ctx(ctx);
483 return;
484 }
485 reply_ctx(ctx, -ENOENT);
486 return;
487 }
488
489 op->mark_started();
490
491 execute_ctx(ctx);
492 utime_t prepare_latency = ceph_clock_now();
493 prepare_latency -= op->get_dequeued_time();
494 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
495 if (op->may_read() && op->may_write()) {
496 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
497 } else if (op->may_read()) {
498 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
499 } else if (op->may_write() || op->may_cache()) {
500 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
501 }
502
503 // force recovery of the oldest missing object if too many logs
504 maybe_force_recovery();
505 }

浙公网安备 33010602011771号