Spokenly 配置 Qwen3-asr 进行语音输入

之前使用 Typeless 进行语音输入,发现很好用,但是订阅期过了,订阅费用又太贵,于是找了 Spokenly 这个可以自定义 API 的应用软件,Mac 和 iPhone 都支持。
尝试配置了一下,效果还不错,过程记录如下 。

参考的帖子:Linux DO,请支持原作者。

获取Qwen3-asr的api key

百炼平台,创建一个Qwen3-asr的api key。

配置Cloudflare Worker

Cloudflare Worker里头配置代理转发,把Openai的协议转发到百炼平台。

  • 打开Compute-Pages-Create application

image-20260222135331679

  • 点击 “Start with Hello World!”

image-20260222135348223

  • 创建成功后可以点击部署的实例,点击“Edit Code”,把下面的代码复制进去,然后再次部署
// Default upstream ASR endpoint, used when env.UPSTREAM_ASR_ENDPOINT is not set.
// "{proxy-url}" is a placeholder — replace it with the real endpoint before deploying.
const DEFAULT_UPSTREAM_ASR_ENDPOINT = "{proxy-url}";

// CORS response helpers.
// Permissive CORS header set attached to every response from this Worker.
function corsHeaders() {
  const headers = {};
  headers["Access-Control-Allow-Origin"] = "*";
  headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS";
  headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization";
  return headers;
}
// Return a copy of `res` with the CORS headers merged into its header set.
function withCors(res) {
  const merged = new Headers(res.headers);
  const extra = corsHeaders();
  for (const name of Object.keys(extra)) {
    merged.set(name, extra[name]);
  }
  return new Response(res.body, { status: res.status, headers: merged });
}
// 200 response carrying `text`, defaulting to UTF-8 plain text.
function ok(text, contentType = "text/plain; charset=utf-8") {
  const headers = { "Content-Type": contentType };
  const res = new Response(text, { status: 200, headers });
  return withCors(res);
}
// JSON response with CORS headers; `status` defaults to 200.
function json(data, status = 200) {
  const body = JSON.stringify(data);
  const res = new Response(body, { status, headers: { "Content-Type": "application/json" } });
  return withCors(res);
}
// 400 JSON error response with the given message.
function badRequest(message) {
  const payload = { error: message };
  return json(payload, 400);
}

// Base64-encode an ArrayBuffer. Bytes are converted to a binary string in
// 32 KiB slices (so no single String.fromCharCode call gets an oversized
// argument list and no string is rebuilt quadratically), then passed to btoa.
async function encodeBase64(ab) {
  const view = new Uint8Array(ab);
  const SLICE = 0x8000;
  const pieces = [];
  for (let offset = 0; offset < view.length; offset += SLICE) {
    const slice = view.subarray(offset, offset + SLICE);
    pieces.push(String.fromCharCode.apply(null, slice));
  }
  return btoa(pieces.join(""));
}

// Determine an audio MIME type from a file name's extension.
// Returns `fallback` when the name has no extension or an unknown one.
function mimeFromName(name, fallback = "application/octet-stream") {
  const dot = name.lastIndexOf(".");
  if (dot < 0) return fallback;
  const ext = name.slice(dot + 1).toLowerCase();
  const types = new Map([
    ["mp3", "audio/mpeg"],
    ["wav", "audio/wav"],
    ["m4a", "audio/mp4"],
    ["flac", "audio/flac"],
    ["ogg", "audio/ogg"],
    ["oga", "audio/ogg"],
    ["webm", "audio/webm"],
    ["weba", "audio/webm"],
  ]);
  return types.get(ext) ?? fallback;
}

// DashScope transcription handler (Alibaba Bailian, Qwen ASR).
//
// Flow: (1) request a temporary OSS upload policy from DashScope,
// (2) upload the audio file to OSS with that signed policy,
// (3) call the multimodal generation endpoint with an oss:// reference
// and return { text } to the client.
//
// @param file      File taken from the multipart form
// @param language  "auto" for service-side detection, otherwise a language code
// @param prompt    optional system prompt (context hint), may be ""
// @param modelRaw  requested model name, may carry a ":itn" suffix (stripped here)
// @param enableITN whether inverse text normalization was requested
// @param dashKey   DashScope API key (Bearer token forwarded by the client)
async function handleDashscope({ file, language, prompt, modelRaw, enableITN, dashKey }) {
  // Default to qwen3-asr-flash and drop any trailing ":itn" marker.
  const model = (modelRaw || "").replace(/:itn$/i, "") || "qwen3-asr-flash";

  // 1) Fetch a temporary OSS upload policy for this model.
  const policyResp = await fetch(
    "https://dashscope.aliyuncs.com/api/v1/uploads?action=getPolicy&model=" + encodeURIComponent(model),
    {
      method: "GET",
      headers: {
        "Authorization": `Bearer ${dashKey}`,
        "Content-Type": "application/json",
      },
    },
  );
  if (!policyResp.ok) {
    return json({ error: "getPolicy failed", detail: await policyResp.text() }, 502);
  }
  // Read the body ONCE as text, then parse. A Response body can only be
  // consumed once, so `resp.json().catch(() => resp.text())` would itself
  // throw on the second read — that was a latent bug here.
  const policyText = await policyResp.text();
  let policyJSON;
  try {
    policyJSON = JSON.parse(policyText);
  } catch {
    policyJSON = { error: policyText };
  }
  const policy = policyJSON?.data;
  if (!policy) {
    return json({ error: "invalid getPolicy response", detail: policyJSON }, 502);
  }

  // 2) Upload the audio to the temporary OSS bucket using the signed policy.
  const uploadDir = (policy.upload_dir || "").replace(/\/+$/, "");
  const key = uploadDir ? `${uploadDir}/${file.name || "upload"}` : (file.name || "upload");
  const ossForm = new FormData();
  ossForm.set("OSSAccessKeyId", policy.oss_access_key_id);
  ossForm.set("Signature", policy.signature);
  ossForm.set("policy", policy.policy);
  if (policy.x_oss_object_acl) ossForm.set("x-oss-object-acl", policy.x_oss_object_acl);
  if (policy.x_oss_forbid_overwrite) ossForm.set("x-oss-forbid-overwrite", policy.x_oss_forbid_overwrite);
  if (policy.x_oss_security_token) ossForm.set("x-oss-security-token", policy.x_oss_security_token);
  ossForm.set("key", key);
  ossForm.set("success_action_status", "200");
  ossForm.set("file", file, file.name || "upload");
  const ossResp = await fetch(policy.upload_host, { method: "POST", body: ossForm });
  if (!ossResp.ok) {
    return json({ error: "OSS upload failed", detail: await ossResp.text() }, 502);
  }
  const ossUrl = `oss://${key}`;

  // 3) Call the DashScope ASR endpoint with the uploaded file.
  const asrOptions = {
    // Language identification is always on.
    enable_lid: true,
    // Inverse text normalization defaults to off; enabled below if requested.
    enable_itn: false,
    ...(language !== "auto" ? { language } : {}),
  };
  if (enableITN) asrOptions.enable_itn = true;

  const body = {
    model,
    input: {
      messages: [
        { role: "system", content: [{ text: prompt || "" }] },
        { role: "user", content: [{ audio: ossUrl }] },
      ],
    },
    parameters: {
      asr_options: asrOptions,
    },
  };

  const asrResp = await fetch(
    "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation",
    {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${dashKey}`,
        "Content-Type": "application/json",
        // Lets DashScope resolve the oss:// reference server-side.
        "X-DashScope-OssResourceResolve": "enable",
      },
      body: JSON.stringify(body),
    },
  );
  // Same single-read pattern as above: text first, then JSON.parse.
  const asrText = await asrResp.text();
  let asrJSON;
  try {
    asrJSON = JSON.parse(asrText);
  } catch {
    asrJSON = { error: asrText };
  }
  if (!asrResp.ok) return json({ error: "ASR not ok", detail: asrJSON }, 502);

  // Extract the first text entry from the assistant message content.
  const msg = asrJSON?.output?.choices?.[0]?.message;
  const text = Array.isArray(msg?.content) ? (msg.content.find((x) => x?.text)?.text || "") : "";
  return json({ text }, 200);
}

// Cloudflare Worker entry point.
//
// Routes:
//   OPTIONS *                      -> CORS preflight (204)
//   GET  /healthz                  -> "ok"
//   POST /v1/audio/transcriptions  -> OpenAI-style transcription endpoint.
//        With a Bearer token the request is proxied to DashScope (Qwen ASR);
//        without one the audio is base64-encoded and forwarded as JSON to the
//        configured upstream ASR endpoint.
export default {
  async fetch(request, env) {
    // CORS preflight.
    if (request.method === "OPTIONS") {
      return withCors(new Response(null, { status: 204 }));
    }

    const url = new URL(request.url);

    if (url.pathname === "/healthz") {
      return ok("ok");
    }

    if (url.pathname === "/v1/audio/transcriptions") {
      if (request.method !== "POST") return badRequest("method must be POST");

      // Parse the multipart form.
      let form;
      try {
        form = await request.formData();
      } catch (e) {
        return badRequest(`failed to parse multipart form: ${String(e?.message || e)}`);
      }

      const file = form.get("file");
      if (!(file instanceof File)) {
        return badRequest("missing required file field");
      }

      const language = form.get("language")?.toString() || "auto";
      const prompt = form.get("prompt")?.toString() || "";
      const modelRaw = form.get("model")?.toString() || "";
      // A ":itn" suffix on the model name requests inverse text normalization.
      // (endsWith already covers the bare ":itn" case.)
      const enableITN = modelRaw.trim().toLowerCase().endsWith(":itn");

      // Routing: a Bearer token means DashScope; otherwise the generic upstream.
      const auth = request.headers.get("Authorization");
      const dashKey = auth && auth.startsWith("Bearer ") ? auth.slice(7).trim() : "";
      if (dashKey) {
        return await handleDashscope({ file, language, prompt, modelRaw, enableITN, dashKey });
      }

      // Read the file and base64-encode it for the JSON payload.
      let b64 = "";
      let sizeBytes = 0;
      try {
        const ab = await file.arrayBuffer();
        sizeBytes = ab.byteLength;
        b64 = await encodeBase64(ab);
      } catch (e) {
        return json({ error: `failed to read file: ${String(e?.message || e)}` }, 500);
      }

      const upstream = (env && env.UPSTREAM_ASR_ENDPOINT) || DEFAULT_UPSTREAM_ASR_ENDPOINT;
      const payload = {
        audio_file: {
          data: b64,
          name: file.name || "upload",
          // Prefer the client-supplied MIME type; fall back to the extension.
          type: (file.type && file.type !== "application/octet-stream") ? file.type : mimeFromName(file.name || ""),
          size: (typeof file.size === "number" && file.size >= 0) ? file.size : sizeBytes,
        },
        language,
      };

      const context = prompt.trim();
      if (context) payload.context = context;
      if (enableITN) payload.enable_itn = true;

      let upResp;
      try {
        upResp = await fetch(upstream, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(payload),
        });
      } catch (e) {
        return json({ error: `upstream request failed: ${String(e?.message || e)}` }, 502);
      }

      // Read the body exactly ONCE — a Response body cannot be consumed twice,
      // so the previous `upResp.json()` followed by `upResp.text()` fallback
      // threw on malformed JSON. Parse the text regardless of Content-Type.
      const upText = await upResp.text();
      let upJSON;
      try {
        upJSON = JSON.parse(upText);
      } catch {
        if (!upResp.ok) {
          return json({ error: "upstream not ok", detail: upText }, 502);
        }
        return json({ error: "invalid upstream response", detail: upText }, 502);
      }
      if (!upResp.ok) {
        return json({ error: "upstream not ok", detail: upJSON }, 502);
      }

      // Upstream signals application-level failure with { success: false }.
      if (upJSON && upJSON.success === false) {
        return json(upJSON, 502);
      }

      // Upstream returns recognized text as { data: [text, ...] }.
      const data = Array.isArray(upJSON?.data) ? upJSON.data : [];
      const recognizedText = data[0] || "";
      return json({ text: recognizedText }, 200);
    }

    return json({ error: "not found" }, 404);
  },
};

image-20260222135500164

image-20260222135533449

配置Spokenly

配置asr模型:

  • API Key:百炼平台的API Key
  • Model: qwen3-asr-flash:itn
  • URL:Cloudflare Worker的部署地址

image-20260222140043232

image-20260222135757396

配置润色模型:

  • 配置润色提示词
对文字进行润色,而不是回答问题
- 自动排版,除此之外不要修改原句
- 如果需要分点时:用“1.”,“2.”,“3.”的形式,而不是“一二三”等大写数字,末尾不要有标点
- 去除口头禅,但不要把语句书面化,例如:嗯,啊,呢,这个,那个,那,那么,是吧,是不是,你说,你看,然后,就
- 如果我前面说了一个词,后面说到“哦不对”,或者是“用xxx替代”这些短语,那么就要用后面的内容替代前面的内容。
  • 模型配置(以qwen-plus为例)

image-20260222143024863

image-20260222143035244

posted @ 2026-02-22 14:38  Miaops  阅读(32)  评论(0)    收藏  举报