🧠

移动端LLM

📂 llm ⏱ 3 min 543 words

---
title: "移动端LLM"
description: "全面介绍移动端LLM部署技术，包括Core ML、TFLite适配和端侧推理优化，实现手机上的大模型运行。"
tags: ["移动端LLM", "Core ML", "TFLite", "端侧推理"]
category: "llm"
icon: "🧠"
---

移动端LLM

移动端LLM的挑战

在手机上运行大模型面临诸多挑战：内存有限（通常为8-16GB）、计算能力受限、电池续航要求高，以及散热受限。通过模型压缩和硬件加速，可以在端侧实现流畅的推理。

Core ML部署（iOS）

模型转换

import coremltools as ct

def convert_to_coreml(torch_model, input_shape=(1, 128), output_path="LLM.mlpackage"):
    """Convert a PyTorch model that consumes token IDs into a Core ML package.

    Args:
        torch_model: PyTorch module called with a single tensor of token IDs.
            It must be traceable by ``torch.jit.trace`` (i.e. return tensors
            or tuples of tensors).
        input_shape: Shape of the example input used for tracing; the same
            fixed shape is declared for the Core ML ``input_ids`` input.
        output_path: Destination of the saved ``.mlpackage``.

    Returns:
        The converted Core ML model (it is also saved to ``output_path``).
    """
    import numpy as np

    model = torch_model.eval()
    # The input is declared below as "input_ids", so trace with integer IDs.
    # Random floats (torch.randn) are not valid indices for an embedding
    # lookup; zero is used because it is a valid index for any vocabulary.
    dummy_input = torch.zeros(input_shape, dtype=torch.long)

    # Tracing only records the forward pass; gradients are not needed.
    with torch.no_grad():
        traced_model = torch.jit.trace(model, dummy_input)

    coreml_model = ct.convert(
        traced_model,
        # int32 matches the MLMultiArray data type the iOS caller fills in.
        inputs=[ct.TensorType(name="input_ids", shape=input_shape, dtype=np.int32)],
        outputs=[ct.TensorType(name="logits")],
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=ct.precision.FLOAT16,
    )

    coreml_model.save(output_path)
    print("Core ML模型已保存")
    return coreml_model

# 转换示例
import torch
from transformers import AutoModelForCausalLM

# torchscript=True makes the Hugging Face model return plain tuples, which
# torch.jit.trace requires; use_cache=False drops the key/value cache from the
# outputs so the traced graph exposes a single output (the logits).
model = AutoModelForCausalLM.from_pretrained("gpt2", torchscript=True, use_cache=False)
convert_to_coreml(model)

iOS推理代码

import CoreML

/// Minimal wrapper that runs a Core ML language model on device.
///
/// Tokenization and decoding are deliberately simplified: every Unicode
/// scalar of the prompt is used as a token ID, and every predicted ID is
/// mapped back to the Unicode scalar with the same value.
class MobileLLM {
    let model: MLModel

    /// Loads the model stored at `modelPath`.
    /// - Throws: Any error raised by `MLModel(contentsOf:)`.
    init(modelPath: String) throws {
        let url = URL(fileURLWithPath: modelPath)
        self.model = try MLModel(contentsOf: url)
    }

    /// Runs one forward pass over `prompt` and greedily decodes the logits.
    ///
    /// - Parameters:
    ///   - prompt: Text to feed to the model.
    ///   - maxTokens: Currently unused by this simplified implementation.
    /// - Returns: The decoded text, or an empty string when the prompt is
    ///   empty, inference fails, or the model has no `logits` output.
    func generate(prompt: String, maxTokens: Int = 100) -> String {
        // Tokenize the input.
        let inputIds = tokenize(prompt)
        guard !inputIds.isEmpty else { return "" }

        do {
            // Build the [1, sequenceLength] int32 input tensor.
            let inputArray = try MLMultiArray(
                shape: [1, NSNumber(value: inputIds.count)],
                dataType: .int32
            )
            for (i, token) in inputIds.enumerated() {
                inputArray[[0, NSNumber(value: i)] as [NSNumber]] = NSNumber(value: token)
            }

            // Run inference. The feature-provider initializer and the
            // prediction call both throw, so errors are handled here instead
            // of crashing the app through `try!`.
            let input = try MLDictionaryFeatureProvider(dictionary: ["input_ids": inputArray])
            let output = try model.prediction(from: input)

            // Decode the output.
            guard let logits = output.featureValue(for: "logits")?.multiArrayValue else {
                print("MobileLLM.generate: model output has no multi-array feature named 'logits'")
                return ""
            }
            return decode(logits: logits)
        } catch {
            print("MobileLLM.generate failed: \(error)")
            return ""
        }
    }

    /// Simplified tokenization: one token per Unicode scalar value.
    private func tokenize(_ text: String) -> [Int] {
        return text.unicodeScalars.map { Int($0.value) }
    }

    /// Greedy decoding: picks the arg-max index at every position of a
    /// rank-3 logits array and appends the Unicode scalar with that value.
    /// Indices that are not valid Unicode scalars (for example surrogate
    /// code points) are skipped rather than force-unwrapped.
    private func decode(logits: MLMultiArray) -> String {
        guard logits.shape.count == 3 else { return "" }
        // `shape` holds NSNumber values, which cannot bound a Swift range.
        let positionCount = logits.shape[1].intValue
        let vocabularySize = logits.shape[2].intValue

        var result = ""
        for i in 0..<positionCount {
            var maxVal: Float = -Float.infinity
            var maxIdx = 0
            for j in 0..<vocabularySize {
                let val = logits[[0, NSNumber(value: i), NSNumber(value: j)] as [NSNumber]].floatValue
                if val > maxVal {
                    maxVal = val
                    maxIdx = j
                }
            }
            if let scalar = UnicodeScalar(UInt32(maxIdx)) {
                result.unicodeScalars.append(scalar)
            }
        }
        return result
    }
}

TFLite部署（Android）

模型转换

import tensorflow as tf

def convert_to_tflite(model_path: str, quantize=True, output_path='model.tflite'):
    """Convert a TensorFlow SavedModel to a TFLite flatbuffer and write it to disk.

    Args:
        model_path: Directory of the SavedModel to convert.
        quantize: When true, apply the default optimizations with float16 as
            the supported type (float16 quantization).
        output_path: File the converted model is written to.

    Returns:
        The serialized TFLite model as bytes.
    """
    converter = tf.lite.TFLiteConverter.from_saved_model(model_path)

    if quantize:
        # Float16 quantization: default optimizations restricted to float16.
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_types = [tf.float16]

    # Operator sets the converted model may use: TFLite builtins first, with
    # selected TensorFlow ops allowed in addition.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]

    tflite_model = converter.convert()

    with open(output_path, 'wb') as f:
        f.write(tflite_model)

    print(f"模型大小: {len(tflite_model) / 1024 / 1024:.1f}MB")
    return tflite_model


convert_to_tflite("model_saved_model")

Android推理代码

import org.tensorflow.lite.Interpreter
import java.nio.ByteBuffer

/**
 * Thin wrapper around a TensorFlow Lite [Interpreter] that runs a language model
 * stored at [modelPath] and greedily decodes its logits.
 *
 * The interpreter holds native resources; call [close] (or use `use { }`) when done.
 */
class MobileLLM(private val modelPath: String) : AutoCloseable {
    private val interpreter: Interpreter

    init {
        val options = Interpreter.Options().apply {
            setNumThreads(4)  // run on 4 CPU threads
            // setUseNNAPI(true)  // enable NNAPI acceleration
        }
        interpreter = Interpreter(loadModelFile(modelPath), options)
    }

    /**
     * Runs one forward pass over [inputIds] and returns the arg-max token id for
     * each output position, limited to the first [maxTokens] positions.
     *
     * The output buffer is sized from the model's own output tensor shape instead
     * of a hard-coded `[1][maxTokens][32000]`, which only worked when the model
     * happened to emit exactly that shape.
     *
     * @param inputIds token ids fed to input 0 as a `[1, inputIds.size]` int32 tensor.
     * @param maxTokens upper bound on the number of token ids returned.
     * @return one token id per output position, at most [maxTokens] of them.
     * @throws IllegalArgumentException if [inputIds] is empty or [maxTokens] is negative.
     * @throws IllegalStateException if output 0 is not a `[1, positions, vocabulary]` tensor.
     */
    fun generate(inputIds: IntArray, maxTokens: Int = 100): IntArray {
        require(inputIds.isNotEmpty()) { "inputIds must not be empty" }
        require(maxTokens >= 0) { "maxTokens must be non-negative, was $maxTokens" }

        // Size the input for this prompt, then re-plan tensors so the output
        // shape queried below reflects the current input length.
        interpreter.resizeInput(0, intArrayOf(1, inputIds.size))
        interpreter.allocateTensors()

        val inputBuffer = ByteBuffer.allocateDirect(inputIds.size * Int.SIZE_BYTES)
            .order(java.nio.ByteOrder.nativeOrder())
        inputIds.forEach { inputBuffer.putInt(it) }
        inputBuffer.rewind()

        val outputShape = interpreter.getOutputTensor(0).shape()
        check(outputShape.size == 3 && outputShape[0] == 1) {
            "expected output shape [1, positions, vocabulary], got ${outputShape.contentToString()}"
        }
        val positions = outputShape[1]
        val vocabularySize = outputShape[2]
        val outputBuffer = Array(1) { Array(positions) { FloatArray(vocabularySize) } }

        interpreter.run(inputBuffer, outputBuffer)

        // Greedy decoding: highest-scoring vocabulary index at each position.
        return outputBuffer[0]
            .take(maxTokens)
            .map { logits -> logits.indices.maxByOrNull { logits[it] } ?: 0 }
            .toIntArray()
    }

    /** Releases the native resources held by the interpreter. */
    override fun close() {
        interpreter.close()
    }

    /**
     * Memory-maps the model file read-only.
     *
     * The stream is closed by `use`; a mapping stays valid after its channel is
     * closed. Mapping also avoids relying on a single `read()` call to fill the
     * whole buffer.
     */
    private fun loadModelFile(path: String): ByteBuffer =
        java.io.FileInputStream(path).use { stream ->
            val channel = stream.channel
            channel.map(java.nio.channels.FileChannel.MapMode.READ_ONLY, 0, channel.size())
        }
}

移动端优化技术

内存优化

class MobileOptimizer:
    """Model-size optimizations for mobile deployment (quantization and pruning)."""

    def apply_quantization(self, model, bits=4):
        """Quantize ``model`` to reduce its memory footprint.

        Args:
            model: The PyTorch model to quantize.
            bits: Target bit width. 8 applies dynamic int8 quantization to
                ``torch.nn.Linear`` layers; 4 delegates to ``_apply_gptq``.

        Returns:
            The quantized model.

        Raises:
            ValueError: If ``bits`` is neither 4 nor 8. Previously this case
                fell through and silently returned ``None``.
        """
        if bits == 8:
            from torch.quantization import quantize_dynamic

            return quantize_dynamic(
                model, {torch.nn.Linear}, dtype=torch.qint8
            )
        if bits == 4:
            # 4-bit quantization is meant to use GPTQ or AWQ.
            return self._apply_gptq(model, bits)
        raise ValueError(f"unsupported bit width: {bits!r} (expected 4 or 8)")

    def _apply_gptq(self, model, bits):
        """Placeholder for GPTQ quantization: logs and returns ``model`` unchanged."""
        # Simplified implementation: no weights are modified here.
        print(f"应用{bits}bit GPTQ量化")
        return model

    def prune_model(self, model, sparsity=0.5):
        """Apply L1 unstructured pruning to every ``torch.nn.Linear`` weight.

        Args:
            model: The PyTorch model to prune in place.
            sparsity: Value passed as ``amount`` to ``prune.l1_unstructured``.

        Returns:
            The same ``model`` object, with pruning applied.
        """
        import torch.nn.utils.prune as prune

        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=sparsity)

        return model

推理加速

class InferenceAccelerator:
    """Inference acceleration techniques (KV cache, speculative decoding)."""

    def apply_kv_cache(self, model):
        """Placeholder for enabling a KV cache: logs and returns ``model`` unchanged."""
        # A KV cache avoids recomputing attention keys/values during
        # autoregressive generation; nothing is configured here yet.
        print("启用KV Cache")
        return model

    def apply_speculative_decoding(self, draft_model, target_model):
        """Build a speculative-decoding generator: small model drafts, large model verifies.

        Args:
            draft_model: Object whose ``generate(tokens, n=5)`` returns draft tokens.
            target_model: Object whose ``forward(draft_tokens)`` returns logits
                used to verify the draft.

        Returns:
            A function ``generate(prompt, max_tokens)`` that returns the prompt
            tokens followed by at most ``max_tokens`` generated tokens.
        """
        def generate(prompt, max_tokens):
            # NOTE(review): `tokenize` is not defined in this snippet; it must
            # be supplied by the surrounding module.
            tokens = tokenize(prompt)
            produced = 0

            # Count generated tokens rather than rounds: one round can accept
            # several tokens, so looping max_tokens times overshoots the limit.
            while produced < max_tokens:
                # The small model quickly proposes a draft.
                draft_tokens = draft_model.generate(tokens, n=5)

                # The large model scores the whole draft in one pass.
                target_logits = target_model.forward(draft_tokens)

                # Accept or reject the draft.
                accepted = self._verify(draft_tokens, target_logits)
                if not accepted:
                    # No progress is possible; stop instead of spinning.
                    break

                accepted = accepted[:max_tokens - produced]
                tokens.extend(accepted)
                produced += len(accepted)

            return tokens

        return generate

    def _verify(self, draft_tokens, target_logits):
        """Greedy verification of a draft against the target model's logits.

        Assumes ``target_logits[i]`` is an indexable sequence of vocabulary
        scores for the position of ``draft_tokens[i]`` -- TODO confirm against
        the real target model's output layout.

        Returns:
            The draft tokens that match the target's arg-max choice, in order.
            At the first mismatch the target's choice is appended instead and
            verification stops.
        """
        accepted = []
        for token, logits in zip(draft_tokens, target_logits):
            best = max(range(len(logits)), key=lambda idx: logits[idx])
            accepted.append(best)
            if best != token:
                break
        return accepted

移动端LLM应用

# Typical on-device LLM scenarios, each with an indicative model size,
# latency budget and feature list.
mobile_use_cases = dict([
    ('智能助手', dict(
        model_size='1-3B',
        latency='<100ms',
        features=['文本生成', '问答', '摘要'],
    )),
    ('输入法', dict(
        model_size='0.5-1B',
        latency='<50ms',
        features=['联想输入', '纠错', '翻译'],
    )),
    ('图像理解', dict(
        model_size='1-3B',
        latency='<200ms',
        features=['图像描述', 'OCR', 'VQA'],
    )),
])

最佳实践

选择1-3B参数的模型作为移动端基础
使用Core ML（iOS）或TFLite（Android）原生框架
实施量化和剪枝减少模型大小
利用KV Cache和投机解码加速推理
监控内存和功耗，确保用户体验

﻿--- title: "移动端LLM" description: "全面介绍移动端LLM部署技术，包括Core ML、TFLite适配和端侧推理优化，实现手机上的大模型运行。" tags: ["移动端LLM", "Core ML", "TFLite", "端侧推理"] category: "llm" icon: "🧠"

移动端LLM

移动端LLM的挑战

Core ML部署（iOS）

模型转换

iOS推理代码

TFLite部署（Android）

模型转换

Android推理代码

移动端优化技术

内存优化

推理加速

移动端LLM应用

最佳实践

--- title: "移动端LLM" description: "全面介绍移动端LLM部署技术，包括Core ML、TFLite适配和端侧推理优化，实现手机上的大模型运行。" tags: ["移动端LLM", "Core ML", "TFLite", "端侧推理"] category: "llm" icon: "🧠"