移动端LLM
---
title: "移动端LLM"
description: "全面介绍移动端LLM部署技术,包括Core ML、TFLite适配和端侧推理优化,实现手机上的大模型运行。"
tags: ["移动端LLM", "Core ML", "TFLite", "端侧推理"]
category: "llm"
icon: "🧠"
---
移动端LLM
移动端LLM的挑战
在手机上运行大模型面临诸多挑战:内存有限(通常为8-16GB)、计算能力受限、电池续航要求高,以及散热能力受限。通过模型压缩和硬件加速,可以实现流畅的端侧推理。
Core ML部署(iOS)
模型转换
import coremltools as ct
def convert_to_coreml(torch_model, input_shape=(1, 128), output_path="LLM.mlpackage"):
    """Convert a PyTorch language model to a Core ML ``.mlpackage``.

    The model is put in eval mode, traced with ``torch.jit.trace`` on an
    integer token-id tensor of ``input_shape``, converted with a float16
    compute precision for an iOS 16 deployment target, and saved to disk.

    Args:
        torch_model: ``torch.nn.Module`` called with a single token-id
            tensor. NOTE(review): ``torch.jit.trace`` needs the forward pass
            to return tensors/tuples; confirm the model is configured so.
        input_shape: Fixed ``(batch, sequence_length)`` shape used both for
            tracing and for the Core ML ``input_ids`` input.
        output_path: Destination of the saved package. Defaults to the
            previously hard-coded ``"LLM.mlpackage"``.
    """
    import numpy as np

    model = torch_model.eval()
    # Token ids are integers: a float tensor from ``torch.randn`` is not a
    # valid index for an embedding lookup, so trace with an all-zero id tensor
    # (id 0 is valid for any vocabulary size).
    dummy_input = torch.zeros(input_shape, dtype=torch.long)
    with torch.no_grad():
        traced_model = torch.jit.trace(model, dummy_input)
    coreml_model = ct.convert(
        traced_model,
        # Declare the input as int32 so it matches integer token ids
        # (the default input dtype would be float32).
        inputs=[ct.TensorType(name="input_ids", shape=input_shape, dtype=np.int32)],
        outputs=[ct.TensorType(name="logits")],
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=ct.precision.FLOAT16,
    )
    coreml_model.save(output_path)
    print("Core ML模型已保存")
# Conversion example
import torch
from transformers import AutoModelForCausalLM

# torchscript=True makes the model return plain tuples, which torch.jit.trace
# can record; use_cache=False drops the key/value cache from the outputs so
# the logits are the only output, matching the single "logits" output that
# convert_to_coreml declares.
model = AutoModelForCausalLM.from_pretrained("gpt2", torchscript=True, use_cache=False)
convert_to_coreml(model)
iOS推理代码
import CoreML
/// Minimal on-device wrapper around a Core ML language model.
///
/// `generate` runs a single forward pass over the prompt and greedily picks
/// the highest-scoring id at every output position. Tokenization and
/// detokenization are simplified placeholders that map Unicode scalars
/// to ids and back.
class MobileLLM {
    let model: MLModel

    /// Loads the Core ML model stored at `modelPath`.
    /// - Throws: Any error raised by `MLModel(contentsOf:)`.
    init(modelPath: String) throws {
        let url = URL(fileURLWithPath: modelPath)
        self.model = try MLModel(contentsOf: url)
    }

    /// Runs the model on `prompt` and returns the greedily decoded text.
    ///
    /// - Parameters:
    ///   - prompt: Input text; an empty prompt yields an empty result.
    ///   - maxTokens: Currently unused; kept for interface compatibility.
    /// - Returns: The decoded string, or `""` when inference fails or the
    ///   model output has no `logits` multi-array (the failure is printed
    ///   instead of crashing the process).
    func generate(prompt: String, maxTokens: Int = 100) -> String {
        // Tokenize the input.
        let inputIds = tokenize(prompt)
        guard !inputIds.isEmpty else { return "" }

        do {
            // Build the (1, sequenceLength) int32 input array.
            let inputArray = try MLMultiArray(
                shape: [1, NSNumber(value: inputIds.count)],
                dataType: .int32
            )
            for (i, token) in inputIds.enumerated() {
                inputArray[[0, NSNumber(value: i)] as [NSNumber]] = NSNumber(value: token)
            }

            // Inference. The feature-provider initializer throws, so it
            // needs `try` just like the prediction call.
            let input = try MLDictionaryFeatureProvider(dictionary: ["input_ids": inputArray])
            let output = try model.prediction(from: input)

            // Decode the output.
            guard let logits = output.featureValue(for: "logits")?.multiArrayValue else {
                print("MobileLLM: model output has no \"logits\" multi-array")
                return ""
            }
            return decode(logits: logits)
        } catch {
            print("MobileLLM: inference failed: \(error)")
            return ""
        }
    }

    /// Simplified tokenization: one id per Unicode scalar of `text`.
    private func tokenize(_ text: String) -> [Int] {
        return text.unicodeScalars.map { Int($0.value) }
    }

    /// Greedy decoding: for every position along axis 1, takes the index of
    /// the largest value along axis 2 and appends it as a Unicode scalar.
    /// Indices that are not valid Unicode scalars are skipped.
    private func decode(logits: MLMultiArray) -> String {
        // `shape` is [NSNumber]; convert to Int so it can bound a range.
        let shape = logits.shape.map { $0.intValue }
        guard shape.count == 3 else { return "" }

        var result = ""
        for i in 0..<shape[1] {
            var maxVal: Float = -Float.infinity
            var maxIdx = 0
            for j in 0..<shape[2] {
                let val = logits[[0, NSNumber(value: i), NSNumber(value: j)] as [NSNumber]].floatValue
                if val > maxVal {
                    maxVal = val
                    maxIdx = j
                }
            }
            // The failable initializer returns nil for surrogates and
            // out-of-range values; skip those instead of force-unwrapping.
            if let scalar = UnicodeScalar(UInt32(maxIdx)) {
                result.unicodeScalars.append(scalar)
            }
        }
        return result
    }
}
TFLite部署(Android)
模型转换
import tensorflow as tf
def convert_to_tflite(model_path: str, quantize=True, output_path='model.tflite'):
    """Convert a TensorFlow SavedModel to a TFLite flatbuffer and write it to disk.

    Args:
        model_path: Directory of the SavedModel to convert.
        quantize: When true, enable the default optimizations with float16
            as the supported type (float16 post-training quantization).
        output_path: File the converted model is written to. Defaults to the
            previously hard-coded ``'model.tflite'``.
    """
    converter = tf.lite.TFLiteConverter.from_saved_model(model_path)
    if quantize:
        # Float16 quantization: default optimizations restricted to float16.
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_types = [tf.float16]
    # Allow both TFLite builtin ops and selected TensorFlow ops in the
    # converted model.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    tflite_model = converter.convert()
    with open(output_path, 'wb') as f:
        f.write(tflite_model)
    print(f"模型大小: {len(tflite_model) / 1024 / 1024:.1f}MB")


convert_to_tflite("model_saved_model")
Android推理代码
import org.tensorflow.lite.Interpreter
import java.nio.ByteBuffer
/**
 * Thin wrapper around a TFLite [Interpreter] that runs one forward pass and
 * greedily decodes the result.
 *
 * The interpreter holds native resources; call [close] (or use `use { }`)
 * when the instance is no longer needed.
 *
 * @param modelPath file-system path of the `.tflite` model.
 */
class MobileLLM(private val modelPath: String) : AutoCloseable {
    private val interpreter: Interpreter

    init {
        val options = Interpreter.Options().apply {
            setNumThreads(4) // use 4 CPU threads
            // setUseNNAPI(true) // enable NNAPI acceleration
        }
        interpreter = Interpreter(loadModelFile(modelPath), options)
    }

    /**
     * Runs the model on [inputIds] and returns the arg-max id for each output
     * position.
     *
     * The output buffer is sized from the model's own output tensor instead of
     * a hard-coded `[1, maxTokens, 32000]`, so any sequence length and
     * vocabulary size reported by the model is handled.
     *
     * @param inputIds token ids written to the model as 32-bit integers, shape `[1, inputIds.size]`.
     * @param maxTokens upper bound on the number of ids returned.
     * @return at most [maxTokens] greedily decoded ids.
     * @throws IllegalArgumentException if [inputIds] is empty or [maxTokens] is negative.
     * @throws IllegalStateException if the model's first output is not `[1, positions, vocab]`.
     */
    fun generate(inputIds: IntArray, maxTokens: Int = 100): IntArray {
        require(inputIds.isNotEmpty()) { "inputIds must not be empty" }
        require(maxTokens >= 0) { "maxTokens must be non-negative, was $maxTokens" }

        // Make the input tensor match this request before sizing the output.
        val inputShape = intArrayOf(1, inputIds.size)
        if (!interpreter.getInputTensor(0).shape().contentEquals(inputShape)) {
            interpreter.resizeInput(0, inputShape)
        }
        interpreter.allocateTensors()

        val inputBuffer = ByteBuffer.allocateDirect(inputIds.size * Int.SIZE_BYTES)
        inputBuffer.order(java.nio.ByteOrder.nativeOrder())
        inputIds.forEach { inputBuffer.putInt(it) }
        inputBuffer.rewind()

        val outputShape = interpreter.getOutputTensor(0).shape()
        check(outputShape.size == 3 && outputShape[0] == 1) {
            "expected output shape [1, positions, vocab], got ${outputShape.contentToString()}"
        }
        val positions = outputShape[1]
        val vocabSize = outputShape[2]
        val outputBuffer = Array(1) { Array(positions) { FloatArray(vocabSize) } }
        interpreter.run(inputBuffer, outputBuffer)

        // Greedy decoding: index of the largest logit at each position.
        return outputBuffer[0]
            .take(maxTokens)
            .map { logits -> logits.indices.maxByOrNull { logits[it] } ?: 0 }
            .toIntArray()
    }

    /** Releases the native resources held by the interpreter. */
    override fun close() {
        interpreter.close()
    }

    /**
     * Memory-maps the model file read-only.
     *
     * The stream and its channel are closed by `use`; the mapping stays valid
     * afterwards. Mapping avoids copying the whole file into a freshly
     * allocated buffer and avoids the partial-read and `Long`-to-`Int`
     * truncation problems of a single `read` into a sized buffer.
     */
    private fun loadModelFile(path: String): ByteBuffer =
        java.io.FileInputStream(path).use { stream ->
            val channel = stream.channel
            channel.map(java.nio.channels.FileChannel.MapMode.READ_ONLY, 0, channel.size())
        }
}
移动端优化技术
内存优化
class MobileOptimizer:
    """Model-compression helpers for mobile deployment."""

    def apply_quantization(self, model, bits=4):
        """Quantize ``model`` to reduce its memory footprint.

        Args:
            model: The ``torch.nn.Module`` to quantize.
            bits: Target bit width. ``8`` applies dynamic int8 quantization to
                ``torch.nn.Linear`` layers; ``4`` delegates to ``_apply_gptq``.

        Returns:
            The quantized model.

        Raises:
            ValueError: If ``bits`` is neither 4 nor 8 (previously the method
                silently returned ``None``).
        """
        if bits == 8:
            from torch.quantization import quantize_dynamic
            return quantize_dynamic(
                model, {torch.nn.Linear}, dtype=torch.qint8
            )
        if bits == 4:
            # 4-bit quantization via GPTQ or AWQ.
            return self._apply_gptq(model, bits)
        raise ValueError(f"unsupported bit width: {bits!r} (expected 4 or 8)")

    def _apply_gptq(self, model, bits):
        """Placeholder for GPTQ quantization: logs and returns ``model`` unchanged."""
        # Simplified implementation.
        print(f"应用{bits}bit GPTQ量化")
        return model

    def prune_model(self, model, sparsity=0.5):
        """Apply L1 unstructured pruning to every ``torch.nn.Linear`` weight.

        Args:
            model: The ``torch.nn.Module`` to prune in place.
            sparsity: Amount passed to ``prune.l1_unstructured`` for each layer.

        Returns:
            The same ``model`` object, with pruned weights made permanent.
        """
        import torch.nn.utils.prune as prune

        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=sparsity)
                # Fold the mask into the weight and drop the extra
                # ``weight_orig`` / ``weight_mask`` tensors; leaving the
                # reparameterization in place stores more data, not less.
                prune.remove(module, 'weight')
        return model
推理加速
class InferenceAccelerator:
    """Inference-acceleration helpers."""

    def apply_kv_cache(self, model):
        """Placeholder for enabling a KV cache: logs and returns ``model`` unchanged.

        A KV cache avoids recomputing attention keys/values for earlier
        positions during autoregressive generation.
        """
        print("启用KV Cache")
        return model

    def apply_speculative_decoding(self, draft_model, target_model):
        """Build a speculative-decoding generator (small draft model + large verifier).

        Args:
            draft_model: Object whose ``generate(tokens, n=5)`` proposes draft tokens.
            target_model: Object whose ``forward(draft_tokens)`` scores the draft.

        Returns:
            A function ``generate(prompt, max_tokens)`` returning the prompt
            tokens followed by at most ``max_tokens`` accepted tokens.
        """
        def generate(prompt, max_tokens):
            # NOTE(review): `tokenize` and `self._verify` are not defined in
            # the visible source; they must be provided elsewhere.
            tokens = tokenize(prompt)
            # Each round may accept several tokens, so cap the total at
            # max_tokens new tokens rather than running max_tokens rounds
            # unconditionally.
            limit = len(tokens) + max_tokens
            for _ in range(max_tokens):
                remaining = limit - len(tokens)
                if remaining <= 0:
                    break
                # Draft model quickly proposes candidate tokens.
                draft_tokens = draft_model.generate(tokens, n=5)
                # Target model scores the draft in one pass.
                target_logits = target_model.forward(draft_tokens)
                # Accept or reject the draft.
                accepted = self._verify(draft_tokens, target_logits)
                tokens.extend(list(accepted)[:remaining])
            return tokens
        return generate
移动端LLM应用
# Typical on-device LLM scenarios: each maps a use-case name to its model
# size range, latency budget and feature list.
mobile_use_cases = {
    use_case: {'model_size': size, 'latency': budget, 'features': feature_list}
    for use_case, size, budget, feature_list in (
        ('智能助手', '1-3B', '<100ms', ['文本生成', '问答', '摘要']),
        ('输入法', '0.5-1B', '<50ms', ['联想输入', '纠错', '翻译']),
        ('图像理解', '1-3B', '<200ms', ['图像描述', 'OCR', 'VQA']),
    )
}
最佳实践
- 选择1-3B参数的模型作为移动端基础
- 使用Core ML(iOS)或TFLite(Android)原生框架
- 实施量化和剪枝减少模型大小
- 利用KV Cache和投机解码加速推理
- 监控内存和功耗,确保用户体验