<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1200 520" width="1200" height="520">
<defs>
  <!-- Drop shadow referenced by the cards via filter="url(#cardShadow)":
       1px downward offset, blur stdDeviation 2, dark flood colour at 6% opacity. -->
  <filter id="cardShadow" x="-4%" y="-4%" width="108%" height="112%">
    <feDropShadow dx="0" dy="1" stdDeviation="2" flood-color="#202124" flood-opacity="0.06"/>
  </filter>
  <!-- Clip region for the header bar. It uses the outline of the outer card
       (x=10, y=10, 1180x500, rx=12) rather than the header's own 48px-high box,
       so only the header's top corners are rounded and its bottom edge stays
       square against the card body. -->
  <clipPath id="headerClip">
    <rect x="10" y="10" width="1180" height="500" rx="12" ry="12"/>
  </clipPath>
  <!-- Arrowheads: identical triangles, differing only in fill colour.
       arrowSolid is blue (solid edges); arrowDash is grey (dashed edge).
       markerUnits="userSpaceOnUse" makes the 8x8 size independent of stroke width. -->
  <marker id="arrowSolid" viewBox="0 0 10 10" markerWidth="8" markerHeight="8" refX="9" refY="5" orient="auto" markerUnits="userSpaceOnUse">
    <path d="M0,1 L9,5 L0,9 Z" fill="#1A73E8" stroke="none"/>
  </marker>
  <marker id="arrowDash" viewBox="0 0 10 10" markerWidth="8" markerHeight="8" refX="9" refY="5" orient="auto" markerUnits="userSpaceOnUse">
    <path d="M0,1 L9,5 L0,9 Z" fill="#5F6368" stroke="none"/>
  </marker>
</defs>
<!-- Outer card: rounded light panel with a 1px border that frames the whole diagram. -->
<rect x="10" y="10" width="1180" height="500" rx="12"
      fill="#F8F9FA" stroke="#DADCE0" stroke-width="1"/>
<!-- Header bar: blue strip across the top of the card, clipped by #headerClip. -->
<rect clip-path="url(#headerClip)" x="10" y="10" width="1180" height="48" fill="#1A73E8"/>
<!-- Diagram title, centred on x=600 (the midpoint of the 1200-wide viewBox). -->
<text x="600" y="42" text-anchor="middle"
      font-family="Roboto, PingFang SC, Microsoft YaHei, sans-serif"
      font-size="20" font-weight="700" fill="#FFFFFF">Multimodal SDK 能力全景</text>
<!-- White container panel for the module-relationship diagram, with its caption at the top-left. -->
<rect x="26" y="68" width="1148" height="148" rx="10"
      fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
<text x="44" y="88" fill="#5F6368" font-size="11" font-weight="700"
      font-family="Roboto, PingFang SC, sans-serif">模块调用关系</text>
<!-- Node: vLLM (right-most box; the dashed grey edge ends on its left side). -->
<g filter="url(#cardShadow)" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <rect x="1038" y="134" width="120" height="40" rx="8"
        fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
  <text x="1098" y="159" font-size="12" font-weight="700" fill="#202124">vLLM</text>
</g>
<!-- Node: mm.patcher.vllm, with two grey detail lines listing its patchers. -->
<g filter="url(#cardShadow)" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <rect x="680" y="118" width="340" height="72" rx="8"
        fill="#FFFFFF" stroke="#1A73E8" stroke-width="1.2"/>
  <text x="850" y="144" font-size="12" font-weight="700" fill="#1A73E8">mm.patcher.vllm</text>
  <text x="850" y="164" font-size="10" fill="#5F6368">video_patcher / image_patcher</text>
  <text x="850" y="178" font-size="10" fill="#5F6368">qwen2_vl / internvl2 image_processor_patcher</text>
</g>
<!-- Node: mm.adapter, with two grey detail lines naming its processors. -->
<g filter="url(#cardShadow)" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <rect x="390" y="118" width="260" height="72" rx="8"
        fill="#FFFFFF" stroke="#1A73E8" stroke-width="1.2"/>
  <text x="520" y="144" font-size="12" font-weight="700" fill="#1A73E8">mm.adapter</text>
  <text x="520" y="164" font-size="10" fill="#5F6368">MultimodalQwen2VLImageProcessor</text>
  <text x="520" y="178" font-size="10" fill="#5F6368">InternVL2PreProcessor</text>
</g>
<!-- Node: mm.acc.wrapper. Highlighted with a tinted fill and a heavier 1.5px border
     than the other nodes. -->
<g filter="url(#cardShadow)" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <rect x="44" y="118" width="300" height="72" rx="8"
        fill="#E8F0FE" stroke="#1A73E8" stroke-width="1.5"/>
  <text x="194" y="144" font-size="12" font-weight="700" fill="#1A73E8">mm.acc.wrapper</text>
  <text x="194" y="164" font-size="10" fill="#5F6368">面向鲲鹏 CPU</text>
  <text x="194" y="178" font-size="10" fill="#5F6368">Image · Video · Tensor · Audio</text>
</g>
<!-- Edges between the module nodes, each followed by its label.
     Solid blue edges end in #arrowSolid; the dashed grey edge ends in #arrowDash.
     Label typography is shared through the group; the paths are unaffected by it. -->
<g font-family="Roboto, PingFang SC, sans-serif" font-size="9" font-weight="600" text-anchor="middle">
  <!-- mm.adapter (left edge, x=390) to mm.acc.wrapper (right edge, x=344). -->
  <path d="M390,154 H344" fill="none" stroke="#1A73E8" stroke-width="2" marker-end="url(#arrowSolid)"/>
  <text x="367" y="146" fill="#1A73E8">调用</text>
  <!-- mm.patcher.vllm (left edge, x=680) to mm.adapter (right edge, x=650). -->
  <path d="M680,154 H650" fill="none" stroke="#1A73E8" stroke-width="2" marker-end="url(#arrowSolid)"/>
  <text x="665" y="146" fill="#1A73E8">patch</text>
  <!-- mm.patcher.vllm (top centre) to mm.acc.wrapper (top centre), routed above the nodes at y=104. -->
  <path d="M850,118 V104 H194 V118" fill="none" stroke="#1A73E8" stroke-width="1.8" marker-end="url(#arrowSolid)"/>
  <text x="522" y="98" fill="#1A73E8">调用</text>
  <!-- mm.patcher.vllm (right edge, x=1020) to vLLM (left edge, x=1038), dashed. -->
  <path d="M1020,154 H1038" fill="none" stroke="#5F6368" stroke-width="1.5" stroke-dasharray="5 3" marker-end="url(#arrowDash)"/>
  <text x="1029" y="146" fill="#5F6368">patch</text>
</g>
<!-- Caption for the capability columns drawn below it. -->
<text x="44" y="238" fill="#5F6368" font-size="11" font-weight="700"
      font-family="Roboto, PingFang SC, sans-serif">公开能力</text>
<!-- Column: mm.acc.wrapper capabilities.
     The column spans x=26..302 (width 276). Rows 1 and 2 are a three-card grid:
     3 x 88px cards + 2 x 6px gaps = 276px, so the grid ends flush with the header
     and with the two-card bottom row. Font family and anchoring are inherited from
     this group. -->
<g id="col-accel" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <!-- Column header. -->
  <rect x="26" y="248" width="276" height="28" rx="6" fill="#1A73E8"/>
  <text x="164" y="267" font-size="12" font-weight="600" fill="#FFFFFF">mm.acc.wrapper</text>

  <!-- Row 1 (y=284): three 88px cards at x=26 / 120 / 214. -->
  <g filter="url(#cardShadow)">
    <rect x="26" y="284" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="70" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">Image.open</text>
    <text x="70" y="318" font-size="8.5" fill="#5F6368">图像加载与解码</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="120" y="284" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="164" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">video_decode</text>
    <text x="164" y="318" font-size="8.5" fill="#5F6368">视频帧解码</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="214" y="284" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="258" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">Tensor</text>
    <text x="258" y="318" font-size="8.5" fill="#5F6368">numpy / torch 互转</text>
  </g>

  <!-- Row 2 (y=342): same three-card grid. -->
  <g filter="url(#cardShadow)">
    <rect x="26" y="342" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="70" y="360" font-size="10.5" font-weight="700" fill="#1A73E8">resize</text>
    <text x="70" y="376" font-size="8.5" fill="#5F6368">图像缩放</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="120" y="342" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="164" y="360" font-size="10.5" font-weight="700" fill="#1A73E8">crop</text>
    <text x="164" y="376" font-size="8.5" fill="#5F6368">图像裁剪</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="214" y="342" width="88" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="258" y="360" font-size="10.5" font-weight="700" fill="#1A73E8">to_tensor</text>
    <text x="258" y="376" font-size="8.5" fill="#5F6368">转为 NCHW 张量</text>
  </g>

  <!-- Row 3 (y=400): two 133px cards with a 10px gap, spanning x=26..302. -->
  <g filter="url(#cardShadow)">
    <rect x="26" y="400" width="133" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="92.5" y="418" font-size="10.5" font-weight="700" fill="#1A73E8">normalize</text>
    <text x="92.5" y="434" font-size="8.5" fill="#5F6368">均值方差归一化</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="169" y="400" width="133" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="235.5" y="418" font-size="10.5" font-weight="700" fill="#1A73E8">load_audio</text>
    <text x="235.5" y="434" font-size="8.5" fill="#5F6368">音频加载与解码</text>
  </g>
</g>
<!-- Column: mm.adapter capabilities (x=316..592). Two full-width cards.
     Both card titles use 10.5px, the same size as the other regular card titles;
     the 276px card width leaves ample room for the longer class name.
     Font family and anchoring are inherited from this group. -->
<g id="col-adapter" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <!-- Column header. -->
  <rect x="316" y="248" width="276" height="28" rx="6" fill="#1A73E8"/>
  <text x="454" y="267" font-size="12" font-weight="600" fill="#FFFFFF">mm.adapter</text>

  <g filter="url(#cardShadow)">
    <rect x="316" y="284" width="276" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="454" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">MultimodalQwen2VLImageProcessor</text>
    <text x="454" y="318" font-size="8.5" fill="#5F6368">Qwen2-VL 多模态预处理</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="316" y="342" width="276" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="454" y="360" font-size="10.5" font-weight="700" fill="#1A73E8">InternVL2PreProcessor</text>
    <text x="454" y="376" font-size="8.5" fill="#5F6368">InternVL2 动态分块预处理</text>
  </g>
</g>
<!-- Column: mm.patcher.vllm capabilities (x=606..882). A 2x2 grid of 131px cards.
     Font family and anchoring are inherited from this group. -->
<g id="col-patcher" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <!-- Column header. -->
  <rect x="606" y="248" width="276" height="28" rx="6" fill="#1A73E8"/>
  <text x="744" y="267" font-size="12" font-weight="600" fill="#FFFFFF">mm.patcher.vllm</text>

  <!-- Row 1 (y=284). -->
  <g filter="url(#cardShadow)">
    <rect x="606" y="284" width="131" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="671.5" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">video_patcher</text>
    <text x="671.5" y="318" font-size="8.5" fill="#5F6368">patch vLLM 视频 IO</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="751" y="284" width="131" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="816.5" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">image_patcher</text>
    <text x="816.5" y="318" font-size="8.5" fill="#5F6368">patch vLLM 图像 IO</text>
  </g>

  <!-- Row 2 (y=342). The two identifiers are 32 and 33 characters long and sit in
       131px cards, so their rendered width depends on which font from the
       font-family list is actually available. textLength pins the rendered width
       to 121px (5px inside each card edge) and lengthAdjust="spacingAndGlyphs"
       scales the glyphs to match, so a wider fallback font cannot spill past the
       card border. -->
  <g filter="url(#cardShadow)">
    <rect x="606" y="342" width="131" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="671.5" y="358" font-size="7.5" font-weight="700" fill="#1A73E8"
          textLength="121" lengthAdjust="spacingAndGlyphs">qwen2_vl_image_processor_patcher</text>
    <text x="671.5" y="376" font-size="8.5" fill="#5F6368">patch Qwen2VL 适配器</text>
  </g>
  <g filter="url(#cardShadow)">
    <rect x="751" y="342" width="131" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="816.5" y="358" font-size="7.5" font-weight="700" fill="#1A73E8"
          textLength="121" lengthAdjust="spacingAndGlyphs">internvl2_image_processor_patcher</text>
    <text x="816.5" y="376" font-size="8.5" fill="#5F6368">patch InternVL2 适配器</text>
  </g>
</g>
<!-- Column: mm.core.frame_selector capabilities (x=896..1172). Two full-width cards.
     Font family and anchoring are inherited from this group. -->
<g id="col-frame-selector" font-family="Roboto, PingFang SC, sans-serif" text-anchor="middle">
  <!-- Column header. -->
  <rect x="896" y="248" width="276" height="28" rx="6" fill="#1A73E8"/>
  <text x="1034" y="267" font-size="12" font-weight="600" fill="#FFFFFF">mm.core.frame_selector</text>

  <g filter="url(#cardShadow)">
    <rect x="896" y="284" width="276" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="1034" y="302" font-size="10.5" font-weight="700" fill="#1A73E8">KFrameSelector</text>
    <text x="1034" y="318" font-size="8.5" fill="#5F6368">关键帧选取</text>
  </g>
  <!-- NOTE(review): "KRangFrameSelector" may be a misspelling of "KRangeFrameSelector";
       confirm against the SDK's exported class name before changing the label. -->
  <g filter="url(#cardShadow)">
    <rect x="896" y="342" width="276" height="50" rx="8" fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
    <text x="1034" y="360" font-size="10.5" font-weight="700" fill="#1A73E8">KRangFrameSelector</text>
    <text x="1034" y="376" font-size="8.5" fill="#5F6368">范围帧选取</text>
  </g>
</g>
<!-- Footer strip: a single centred tagline line across the bottom of the card. -->
<!-- NOTE(review): the tagline names an Ascend NPU target, while the mm.acc.wrapper
     node in the relationship panel names a Kunpeng CPU target; confirm which
     wording is intended. -->
<rect x="26" y="468" width="1148" height="28" rx="8"
      fill="#FFFFFF" stroke="#DADCE0" stroke-width="1"/>
<text x="600" y="487" text-anchor="middle" fill="#1A73E8" font-size="11"
      font-family="Roboto, PingFang SC, sans-serif">面向昇腾 NPU • 加速多模态大模型推理预处理 • 覆盖解码 / 变换 / 模型适配全链路</text>
</svg>