上一篇文章完成了llama.cpp Android的编译和相关so的加载,这一篇文章在so加载的基础上,实现大模型的加载,目的是将量化后的模型在手机上运行起来。
1. Kotlin 桥接类:Llama.kt
app/src/main/java/com/example/llamatest/Llama.kt中
/**
 * Kotlin-side JNI bridge declaring three native entry points:
 * [loadModel], [unloadModel] and [chat].
 *
 * NOTE(review): the llama_wrapper.cpp shown in this article exports its JNI
 * functions for `MainActivity` (loadModel / generate / releaseModel), and
 * `MainActivity` loads "llama_jni" itself. Nothing in this article provides
 * native implementations for this object, so calling these functions would
 * presumably fail with UnsatisfiedLinkError. Confirm whether this object is
 * still needed.
 */
object Llama {
    init {
        // Load order is significant and kept as-is: "ggml" first, then "llama".
        for (library in arrayOf("ggml", "llama")) {
            System.loadLibrary(library)
        }
    }

    /** Native: loads the model file at [path]; returns whether loading succeeded. */
    external fun loadModel(path: String): Boolean

    /** Native: unloads the model. */
    external fun unloadModel()

    /** Native: takes [prompt] and returns a String reply. */
    external fun chat(prompt: String): String
}
2. JNI C++ 完整实现:llama_wrapper.cpp
主要实现了三个JNI接口:
模型加载:loadModel
文本生成:generate
模型内存释放:releaseModel
app/src/main/cpp/llama_wrapper.cpp中
#include <jni.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <android/log.h>

#ifdef __cplusplus
extern "C" {
#endif
#include "llama.h"
#ifdef __cplusplus
}
#endif

#define LOGD(...) __android_log_print(ANDROID_LOG_INFO, "LLAMA_FIX", __VA_ARGS__)

//=============================================
// Global state
//=============================================
// One model/context pair for the whole process. Every access happens while
// holding g_mutex, because the Kotlin side calls loadModel / generate /
// releaseModel from different threads.
static llama_model* g_model = nullptr;
static llama_context* g_ctx = nullptr;
static const llama_vocab* g_vocab = nullptr;

static std::mutex g_mutex;

// Number of load/release calls currently waiting to tear the model down.
// The generation loop polls it so that a teardown does not have to wait for
// all remaining tokens.
static std::atomic<int> g_teardown_requests{0};

static constexpr uint32_t kContextSize = 1024;  // tokens of context per request
static constexpr int32_t kMaxGenTokens = 32;    // upper bound on generated tokens

// Frees context and model (in that order) and resets the globals.
// Caller must hold g_mutex.
static void free_model_locked() {
    if (g_ctx) {
        llama_free(g_ctx);
        g_ctx = nullptr;
    }
    if (g_model) {
        llama_model_free(g_model);
        g_model = nullptr;
    }
    g_vocab = nullptr;
}

// Picks a worker-thread count: leaves two cores free and caps at four.
// NOTE(review): the cap is a heuristic, not a measurement. Tune per device.
static int32_t pick_thread_count() {
    const unsigned hw = std::thread::hardware_concurrency();  // 0 when unknown
    if (hw <= 2) {
        return 1;
    }
    return static_cast<int32_t>(std::min(hw - 2, 4u));
}

// Builds a java.lang.String from standard UTF-8 bytes via
// `new String(byte[], "UTF-8")`.
//
// NewStringUTF expects *Modified* UTF-8, so 4-byte sequences (emoji) or a
// multi-byte character cut off by the token limit are invalid input for it.
// The String constructor replaces malformed input instead of failing.
// Returns nullptr (with a Java exception pending) if a JNI call fails.
static jstring to_jstring(JNIEnv* env, const std::string& utf8) {
    const jsize len = static_cast<jsize>(utf8.size());
    jbyteArray bytes = env->NewByteArray(len);
    if (bytes == nullptr) {
        return nullptr;
    }
    env->SetByteArrayRegion(bytes, 0, len, reinterpret_cast<const jbyte*>(utf8.data()));

    jstring out = nullptr;
    jclass string_cls = env->FindClass("java/lang/String");
    jstring charset = env->NewStringUTF("UTF-8");
    if (string_cls != nullptr && charset != nullptr) {
        jmethodID ctor = env->GetMethodID(string_cls, "<init>", "([BLjava/lang/String;)V");
        if (ctor != nullptr) {
            out = static_cast<jstring>(env->NewObject(string_cls, ctor, bytes, charset));
        }
    }

    if (charset != nullptr) env->DeleteLocalRef(charset);
    if (string_cls != nullptr) env->DeleteLocalRef(string_cls);
    env->DeleteLocalRef(bytes);
    return out;
}

//=============================================
// Load model
//=============================================
// Replaces any previously loaded model with the GGUF file at `modelPath`
// (CPU only, n_gpu_layers = 0). Returns JNI_TRUE only when both the model and
// its context were created; on failure nothing stays loaded.
extern "C" JNIEXPORT jboolean JNICALL
Java_com_example_llamatest_MainActivity_loadModel(
        JNIEnv* env, jobject /* thiz */, jstring modelPath) {
    if (modelPath == nullptr) {
        return JNI_FALSE;
    }
    const char* path_chars = env->GetStringUTFChars(modelPath, nullptr);
    if (path_chars == nullptr) {
        return JNI_FALSE;  // allocation failed; a Java exception is pending
    }
    const std::string path(path_chars);
    env->ReleaseStringUTFChars(modelPath, path_chars);

    ++g_teardown_requests;  // ask a running generate() to stop early
    std::lock_guard<std::mutex> lock(g_mutex);
    free_model_locked();
    --g_teardown_requests;

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;
    g_model = llama_model_load_from_file(path.c_str(), mparams);
    if (!g_model) {
        LOGD("loadModel: failed to load %s", path.c_str());
        return JNI_FALSE;
    }
    g_vocab = llama_model_get_vocab(g_model);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = kContextSize;
    const int32_t n_threads = pick_thread_count();
    cparams.n_threads = n_threads;
    cparams.n_threads_batch = n_threads;
    g_ctx = llama_init_from_model(g_model, cparams);
    if (!g_ctx) {
        LOGD("loadModel: failed to create context");
        free_model_locked();  // do not keep the model resident after a failed load
        return JNI_FALSE;
    }
    return JNI_TRUE;
}

//=============================================
// Sample token
//=============================================
// Greedy sampling: returns the id with the highest logit at the last decoded
// position, or LLAMA_TOKEN_NULL when no logits are available.
// Caller must hold g_mutex.
static llama_token sample_token() {
    const float* logits = llama_get_logits_ith(g_ctx, -1);
    const int32_t n_vocab = llama_vocab_n_tokens(g_vocab);
    if (logits == nullptr || n_vocab <= 0) {
        return LLAMA_TOKEN_NULL;
    }
    const float* best = std::max_element(logits, logits + n_vocab);
    return static_cast<llama_token>(best - logits);
}

//=============================================
// Generate
//=============================================
// Runs one prompt through the model and appends the generated text to `out`.
// Returns nullptr on success, otherwise a user-facing error message.
// Caller must hold g_mutex.
//
// Batches come from llama_batch_get_one(), which wraps the caller's token
// array; as in the original code they are never passed to llama_batch_free().
static const char* run_generation_locked(const std::string& input, std::string& out) {
    if (!g_ctx || !g_model || !g_vocab) {
        return "模型未加载";
    }

    // Two-pass tokenization: the first call reports the required size as a
    // negative number, so prompts are no longer limited to a fixed 512 tokens.
    // parse_special=true makes the <start_of_turn>/<end_of_turn> markers map
    // to their special tokens instead of being tokenized as plain text.
    const int32_t text_len = static_cast<int32_t>(input.size());
    const int32_t probe =
            llama_tokenize(g_vocab, input.c_str(), text_len, nullptr, 0, true, true);
    if (probe >= 0 || probe == INT32_MIN) {
        return "分词失败";
    }
    std::vector<llama_token> tokens(static_cast<size_t>(-probe));
    const int32_t n_tokens =
            llama_tokenize(g_vocab, input.c_str(), text_len, tokens.data(),
                           static_cast<int32_t>(tokens.size()), true, true);
    if (n_tokens <= 0) {
        return "分词失败";
    }

    // The prompt plus the generated tokens must fit into the context, and the
    // prompt is decoded as a single batch.
    const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(g_ctx));
    const int32_t n_batch = static_cast<int32_t>(llama_n_batch(g_ctx));
    if (n_tokens > n_batch || n_tokens + kMaxGenTokens > n_ctx) {
        return "输入过长";
    }

    // Every request is an independent single-turn chat. Without this, tokens
    // from earlier requests stay in the context until it overflows.
    llama_memory_clear(llama_get_memory(g_ctx), true);

    if (llama_decode(g_ctx, llama_batch_get_one(tokens.data(), n_tokens)) != 0) {
        LOGD("generate: prompt decode failed");
        return "推理失败";
    }

    for (int32_t i = 0; i < kMaxGenTokens; ++i) {
        if (g_teardown_requests.load() > 0) {
            break;  // a load/release call is waiting for the lock
        }

        llama_token token = sample_token();
        // Stop on any end-of-generation token (covers <end_of_turn> as well as
        // EOS). The `token == 0` guard is kept from the original code.
        if (token == LLAMA_TOKEN_NULL || token == 0 || llama_vocab_is_eog(g_vocab, token)) {
            break;
        }

        char piece[256];
        const int32_t n_piece =
                llama_token_to_piece(g_vocab, token, piece, sizeof(piece), 0, false);
        if (n_piece > 0) {
            out.append(piece, static_cast<size_t>(n_piece));
        }

        if (llama_decode(g_ctx, llama_batch_get_one(&token, 1)) != 0) {
            LOGD("generate: decode failed after %d tokens", i + 1);
            break;  // return what has been generated so far
        }
    }
    return nullptr;
}

// JNI entry point: wraps `prompt` in the chat template used by this article
// and returns the generated reply, or an error message, as a Java String.
//
// NOTE(review): GetStringUTFChars yields Modified UTF-8, so prompt characters
// outside the BMP (e.g. emoji) reach the tokenizer in a non-standard encoding.
extern "C" JNIEXPORT jstring JNICALL
Java_com_example_llamatest_MainActivity_generate(
        JNIEnv* env, jobject /* thiz */, jstring prompt) {
    std::string input = "<start_of_turn>user\n";
    if (prompt != nullptr) {
        const char* prompt_c = env->GetStringUTFChars(prompt, nullptr);
        if (prompt_c == nullptr) {
            return nullptr;  // allocation failed; a Java exception is pending
        }
        input += prompt_c;
        env->ReleaseStringUTFChars(prompt, prompt_c);
    }
    input += "<end_of_turn>\n<start_of_turn>model\n";

    std::string result;
    const char* error = nullptr;
    {
        std::lock_guard<std::mutex> lock(g_mutex);
        error = run_generation_locked(input, result);
    }
    return to_jstring(env, error != nullptr ? std::string(error) : result);
}

//=============================================
// Release model
//=============================================
// Frees the context and the model. Safe to call when nothing is loaded.
extern "C" JNIEXPORT void JNICALL
Java_com_example_llamatest_MainActivity_releaseModel(
        JNIEnv* /* env */, jobject /* thiz */) {
    ++g_teardown_requests;  // ask a running generate() to stop early
    {
        std::lock_guard<std::mutex> lock(g_mutex);
        free_model_locked();
    }
    --g_teardown_requests;
}
3. 布局:activity_main.xml
res/layout/activity_main.xml中
<?xml version="1.0" encoding="utf-8"?>
<!-- Main screen: a result text area, a prompt input, a "select model" button
     and a "send" button, stacked vertically. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:gravity="center"
    android:orientation="vertical"
    android:padding="16dp">

    <!-- Note: the ID uses snake_case: tv_result -->
    <TextView
        android:id="@+id/tv_result"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:textSize="24sp"
        android:textColor="#FF0000"
        android:textStyle="bold"
        android:gravity="center"
        android:minHeight="300dp" />

    <!-- Note: the ID uses snake_case: et_input -->
    <EditText
        android:id="@+id/et_input"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:layout_marginTop="20dp" />

    <Button
        android:id="@+id/btn_select_model"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:text="选择模型" />

    <Button
        android:id="@+id/btn_send"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:text="发送" />

</LinearLayout>
4. Activity 完整逻辑:MainActivity.kt
package com.example.llamatest

import android.app.Activity
import android.content.Intent
import android.net.Uri
import android.os.Bundle
import android.os.Handler
import android.os.Looper
import android.util.Log
import android.widget.EditText
import android.widget.TextView
import java.io.File
import java.io.FileOutputStream
import java.io.IOException
import java.util.concurrent.Executors

/**
 * Single-screen demo: the user picks a GGUF model file, it is copied into the
 * app's private storage and loaded through JNI, then prompts typed into the
 * input box are sent to the native `generate` function.
 *
 * All native calls (load / generate / release) run on one shared background
 * thread so that they can never overlap.
 */
class MainActivity : Activity() {

    private lateinit var tvResult: TextView
    private lateinit var etInput: EditText
    private lateinit var btnSelectModel: android.widget.Button
    private lateinit var btnSend: android.widget.Button

    private val REQUEST_FILE = 100
    private val uiHandler = Handler(Looper.getMainLooper())

    // Model load state. Only read and written on the UI thread.
    private var isModelLoaded = false

    external fun loadModel(modelPath: String): Boolean
    external fun generate(prompt: String): String
    external fun releaseModel()

    companion object {
        private const val TAG = "LLAMA_DEBUG_FINAL"

        // One worker thread for every native call, shared by all Activity
        // instances. Running load, generate and release strictly one after
        // another prevents two generations from running at once, and prevents
        // the model from being freed or replaced while a generation is using it.
        private val nativeExecutor = Executors.newSingleThreadExecutor()

        // False when libllama_jni.so could not be loaded. Native calls are
        // skipped in that case.
        private var nativeLibraryLoaded = false

        init {
            try {
                Log.d(TAG, "【初始化】加载库:llama_jni")
                System.loadLibrary("llama_jni")
                nativeLibraryLoaded = true
                Log.d(TAG, "【初始化】✅ 库加载成功")
            } catch (e: UnsatisfiedLinkError) {
                // loadLibrary reports failure with an Error, not an Exception,
                // so `catch (e: Exception)` would never run.
                Log.e(TAG, "【初始化】❌ 库加载失败", e)
            }
        }
    }

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_main)
        Log.d(TAG, "【生命周期】✅ onCreate")

        tvResult = findViewById(R.id.tv_result)
        etInput = findViewById(R.id.et_input)
        btnSelectModel = findViewById(R.id.btn_select_model)
        btnSend = findViewById(R.id.btn_send)

        if (!nativeLibraryLoaded) {
            // Without the native library every external call would crash.
            btnSelectModel.isEnabled = false
            btnSend.isEnabled = false
            updateUIText("❌ 库加载失败")
            return
        }

        updateUIText("👉 请选择模型文件")
        Log.d(TAG, "【界面】初始化完成")

        btnSelectModel.setOnClickListener {
            Log.d(TAG, "【点击】👉 选择模型")
            updateUIText("📂 打开文件选择器...")
            val intent = Intent(Intent.ACTION_OPEN_DOCUMENT).apply {
                addCategory(Intent.CATEGORY_OPENABLE)
                type = "*/*"
            }
            startActivityForResult(intent, REQUEST_FILE)
        }

        btnSend.setOnClickListener {
            // Check the model state before generating.
            if (!isModelLoaded) {
                updateUIText("❌ 请先选择并加载模型!")
                return@setOnClickListener
            }
            val prompt = etInput.text.toString().trim()
            if (prompt.isEmpty()) {
                updateUIText("请输入问题")
                return@setOnClickListener
            }
            Log.d(TAG, "【点击】👉 发送问题:$prompt")
            updateUIText("你:$prompt\n\n💬 AI 思考中...")

            // Block further sends until this request has finished.
            btnSend.isEnabled = false
            nativeExecutor.execute {
                try {
                    Log.d(TAG, "【JNI】👉 调用 generate()")
                    val reply = generate(prompt)
                    Log.d(TAG, "【JNI】✅ 生成结果:$reply")
                    uiHandler.post {
                        updateUIText("你:$prompt\n\n🤖 AI:$reply")
                        etInput.setText("")
                    }
                } catch (e: Exception) {
                    Log.e(TAG, "【JNI】❌ 生成失败", e)
                    uiHandler.post { updateUIText("错误:${e.message}") }
                } finally {
                    uiHandler.post { btnSend.isEnabled = true }
                }
            }
        }
    }

    override fun onDestroy() {
        super.onDestroy()
        isModelLoaded = false
        if (nativeLibraryLoaded) {
            // Queued behind any in-flight load/generate, so the model is never
            // freed while it is in use and the UI thread is not blocked.
            nativeExecutor.execute {
                try {
                    Log.d(TAG, "【生命周期】✅ 页面销毁,释放模型资源")
                    releaseModel()
                } catch (e: Exception) {
                    Log.e(TAG, "【生命周期】❌ 释放模型失败", e)
                }
            }
        }
    }

    override fun onActivityResult(requestCode: Int, resultCode: Int, data: Intent?) {
        super.onActivityResult(requestCode, resultCode, data)
        Log.d(TAG, "【回调】✅ onActivityResult")
        if (requestCode == REQUEST_FILE && resultCode == RESULT_OK) {
            val uri = data?.data ?: return
            Log.d(TAG, "【文件】✅ 选中:$uri")

            // The current model is about to be replaced. Refuse prompts until
            // the new one has been loaded.
            isModelLoaded = false
            updateUIText("✅ 已选择模型\n开始复制...")
            Log.d(TAG, "【UI】显示:开始复制")

            nativeExecutor.execute {
                val file = File(filesDir, "gemma.gguf")
                Log.d(TAG, "【文件】目标路径:${file.absolutePath}")
                try {
                    Log.d(TAG, "【文件】👉 开始复制...")
                    // A null stream means nothing was copied. Fail loudly
                    // instead of loading a stale or missing file.
                    val source = contentResolver.openInputStream(uri)
                        ?: throw IOException("无法打开所选文件:$uri")
                    source.use { input ->
                        FileOutputStream(file).use { output -> input.copyTo(output) }
                    }
                    Log.d(TAG, "【文件】✅ 复制完成")
                    uiHandler.post {
                        updateUIText("✅ 复制完成\n正在加载模型...")
                        Log.d(TAG, "【UI】显示:复制完成")
                    }

                    Log.d(TAG, "【JNI】👉 开始调用 loadModel()")
                    val success = loadModel(file.absolutePath)
                    Log.d(TAG, "【JNI】✅ loadModel 返回:$success")
                    uiHandler.post {
                        if (success) {
                            isModelLoaded = true
                            updateUIText("🎉 模型加载成功!可以聊天了!")
                            Log.d(TAG, "【UI】✅ 加载成功,状态置为 true")
                        } else {
                            isModelLoaded = false
                            updateUIText("❌ 模型加载失败")
                            Log.d(TAG, "【UI】❌ 加载失败,状态置为 false")
                        }
                    }
                } catch (e: Exception) {
                    Log.e(TAG, "【异常】❌ 执行失败", e)
                    uiHandler.post { updateUIText("错误:${e.message}") }
                }
            }
        }
    }

    /** Shows [s] in the result view; safe to call from any thread. */
    private fun updateUIText(s: String) {
        runOnUiThread {
            // Background work may finish after the Activity is gone.
            if (isDestroyed) return@runOnUiThread
            tvResult.text = s
            Log.d(TAG, "【UI】刷新文字:$s")
        }
    }
}
5. CMakeLists.txt(必须加)
app/src/main/cpp/CMakeLists.txt中
cmake_minimum_required(VERSION 3.22.1)
project(llamatest)

# Locations shared by the targets below.
set(PREBUILT_SO_DIR ${CMAKE_SOURCE_DIR}/../jniLibs/${ANDROID_ABI})
set(LLAMA_HEADER_DIR ${CMAKE_SOURCE_DIR}/llama)

# Import the prebuilt libllama.so and libggml.so and attach the header path
# to each imported target.
foreach(prebuilt IN ITEMS llama ggml)
    add_library(${prebuilt} SHARED IMPORTED)
    set_target_properties(${prebuilt} PROPERTIES
        IMPORTED_LOCATION ${PREBUILT_SO_DIR}/lib${prebuilt}.so)
    target_include_directories(${prebuilt} INTERFACE ${LLAMA_HEADER_DIR})
endforeach()

# The JNI library; its source file must be llama_wrapper.cpp.
add_library(llama_jni SHARED
    llama_wrapper.cpp
)

# Headers used when compiling the wrapper.
target_include_directories(llama_jni PRIVATE
    ${LLAMA_HEADER_DIR}
)

# Libraries linked into the wrapper.
target_link_libraries(llama_jni
    llama
    ggml
    android
    log
)
6. 关键配置:build.gradle.kts(app 模块)
// App-module build script: Android application with a CMake-built native
// library, packaged for arm64-v8a only.
plugins {
    alias(libs.plugins.android.application)
    alias(libs.plugins.kotlin.compose)
}

android {
    namespace = "com.example.llamatest"
    compileSdk {
        version = release(36) {
            minorApiLevel = 1
        }
    }

    defaultConfig {
        applicationId = "com.example.llamatest"
        minSdk = 35
        targetSdk = 36
        versionCode = 1
        versionName = "1.0"

        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"

        // ========== Added: CMake configuration (compile native code as C++17) ==========
        externalNativeBuild {
            cmake {
                cppFlags += "-std=c++17"
            }
        }

        // ========== Added: package only the arm64-v8a ABI ==========
        ndk {
            abiFilters.add("arm64-v8a")
        }
    }

    buildTypes {
        release {
            isMinifyEnabled = false
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
            )
        }
    }

    // ========== Added: path to CMakeLists.txt and the CMake version to use ==========
    externalNativeBuild {
        cmake {
            path = file("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }

    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_11
        targetCompatibility = JavaVersion.VERSION_11
    }

    // NOTE(review): Compose is enabled although the MainActivity shown in this
    // article uses an XML layout. Confirm whether Compose is still needed.
    buildFeatures {
        compose = true
    }
}

dependencies {
    implementation("androidx.appcompat:appcompat:1.6.1")
    implementation("com.google.android.material:material:1.11.0")
    implementation("androidx.constraintlayout:constraintlayout:2.1.4")
    implementation(libs.androidx.core.ktx)
    implementation(libs.androidx.lifecycle.runtime.ktx)
    implementation(libs.androidx.activity.compose)
    implementation(platform(libs.androidx.compose.bom))
    implementation(libs.androidx.compose.ui)
    implementation(libs.androidx.compose.ui.graphics)
    implementation(libs.androidx.compose.ui.tooling.preview)
    implementation(libs.androidx.compose.material3)
    testImplementation(libs.junit)
    androidTestImplementation(libs.androidx.junit)
    androidTestImplementation(libs.androidx.espresso.core)
    androidTestImplementation(platform(libs.androidx.compose.bom))
    androidTestImplementation(libs.androidx.compose.ui.test.junit4)
    debugImplementation(libs.androidx.compose.ui.tooling)
    debugImplementation(libs.androidx.compose.ui.test.manifest)
}
7. 预置SO库
把上一篇文章中编译生成的Android so放到对应的目录
app/src/main/jniLibs/arm64-v8a/*.so中:
8. 运行说明
我使用的是前面文章《端侧AI 模型部署实战三(模型转换)》中量化出来的模型gemma-3-4b-it-q4_K_M.gguf,模型文件需要自己push到手机中。 手机如果没有root,模型加载的时候会遇到权限问题,因为我使用的是个人手机没有解锁root,所以采用了通过用户选择文件授权的方式。
9. 运行结果演示(手机断网状态下):
10. 运行遇到难点
难点1:我下载的是今年3月份的llama.cpp b8648版本,比较新,在使用AI生成相关测试代码的时候,遇到多次编译和运行crash的问题,AI来回折腾搞了半天,最后通过提供给AI最新的源码,基于最新源码进行AI输出,具体做法:
把PC侧的llama-cli.exe的源码丢给AI,然后要求AI "参考官方 cli.cpp 移植的 JNI 完整版"
注意:模块的加载和推理生成这块代码的JNI接口移植是端侧的核心内容,基本的编程代码输出可以让AI处理,核心流程还是需要自己阅读代码深入了解。
11. 遗留问题
当前只移植了LLM, 多模态需要进一步支持。
输出生成慢,当前生成一段文本需要10多秒,需要进行速度优化。
实时性问题无法进行回答,需要在手机端实现RAG。