diff --git a/rllm/llama-cpp-low/Cargo.toml b/rllm/llama-cpp-low/Cargo.toml
index 399ae68a..8dac2412 100644
--- a/rllm/llama-cpp-low/Cargo.toml
+++ b/rllm/llama-cpp-low/Cargo.toml
@@ -16,3 +16,6 @@ cmake = "0.1.50"
 [features]
 default = []
 cuda = []
+sycl = []
+sycl_fp16 = []
+sycl_nvidia = []
diff --git a/rllm/llama-cpp-low/build.rs b/rllm/llama-cpp-low/build.rs
index 2ee2cab6..acdc1d16 100644
--- a/rllm/llama-cpp-low/build.rs
+++ b/rllm/llama-cpp-low/build.rs
@@ -5,7 +5,14 @@ const SUBMODULE_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/llama.cpp");
 
 fn main() {
     let ccache = true;
-    let cuda = std::env::var("CARGO_FEATURE_CUDA").unwrap_or(String::new());
+    let flag_cuda = std::env::var("CARGO_FEATURE_CUDA").unwrap_or(String::new()) == "1";
+    let flag_sycl = std::env::var("CARGO_FEATURE_SYCL").unwrap_or(String::new()) == "1";
+    let flag_sycl_fp16 = std::env::var("CARGO_FEATURE_SYCL_FP16").unwrap_or(String::new()) == "1";
+    let flag_sycl_nvidia = std::env::var("CARGO_FEATURE_SYCL_NVIDIA").unwrap_or(String::new()) == "1";
+
+    // oneAPI environment variables
+    let mkl_root = std::env::var("MKLROOT");
+    let cmplr_root = std::env::var("CMPLR_ROOT");
 
     let submodule_dir = &PathBuf::from(SUBMODULE_DIR);
     let header_path = submodule_dir.join("llama.h");
@@ -29,15 +36,66 @@ fn main() {
             .configure_arg("-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache");
     }
 
-    if cuda == "1" {
+    if flag_cuda && flag_sycl {
+        panic!("The `cuda` and `sycl` features cannot be enabled at the same time!");
+    }
+    if flag_cuda {
         cmake.configure_arg("-DLLAMA_CUBLAS=ON");
         println!("cargo:rustc-link-search=/usr/local/cuda/lib64");
         println!("cargo:rustc-link-lib=cuda");
         println!("cargo:rustc-link-lib=cudart");
         println!("cargo:rustc-link-lib=cublas");
         println!("cargo:rustc-link-lib=cupti");
-    }
+    } else if flag_sycl {
+        assert!(
+            mkl_root.is_ok(),
+            "MKLROOT is not set (please `source /opt/intel/oneapi/setvars.sh` if oneAPI is installed)"
+        );
+        assert!(
+            cmplr_root.is_ok(),
+            "CMPLR_ROOT is not set (please `source /opt/intel/oneapi/setvars.sh` if oneAPI is installed)"
+        );
+        let mkl_root_str = mkl_root.unwrap();
+        //let cmplr_root_str = cmplr_root.unwrap();
+
+        cmake
+            .define("LLAMA_SYCL", "ON")
+            .define("CMAKE_C_COMPILER", "icx")
+            .define("CMAKE_CXX_COMPILER", "icpx");
+        println!("cargo:rustc-link-arg=-fiopenmp");
+        println!("cargo:rustc-link-arg=-fopenmp-targets=spir64_gen");
+        println!("cargo:rustc-link-arg=-fsycl");
+        println!("cargo:rustc-link-arg=-Wl,--no-as-needed");
+        println!("cargo:rustc-link-arg=-Wno-narrowing");
+        println!("cargo:rustc-link-arg=-O3");
+        //println!("cargo:rustc-link-search=native={}/lib", cmplr_root_str);
+        println!("cargo:rustc-link-search=native={}/lib", mkl_root_str);
+        println!("cargo:rustc-link-lib=svml");
+        println!("cargo:rustc-link-lib=mkl_sycl_blas");
+        println!("cargo:rustc-link-lib=mkl_sycl_lapack");
+        println!("cargo:rustc-link-lib=mkl_sycl_dft");
+        println!("cargo:rustc-link-lib=mkl_sycl_sparse");
+        println!("cargo:rustc-link-lib=mkl_sycl_vm");
+        println!("cargo:rustc-link-lib=mkl_sycl_rng");
+        println!("cargo:rustc-link-lib=mkl_sycl_stats");
+        println!("cargo:rustc-link-lib=mkl_sycl_data_fitting");
+        println!("cargo:rustc-link-lib=mkl_intel_ilp64");
+        println!("cargo:rustc-link-lib=mkl_intel_thread");
+        println!("cargo:rustc-link-lib=mkl_tbb_thread");
+        println!("cargo:rustc-link-lib=mkl_core");
+        println!("cargo:rustc-link-lib=iomp5");
+        println!("cargo:rustc-link-lib=sycl");
+        println!("cargo:rustc-link-lib=pthread");
+        println!("cargo:rustc-link-lib=m");
+        println!("cargo:rustc-link-lib=dl");
+        println!("cargo:rustc-link-lib=intlc");
+        println!("cargo:rustc-link-lib=imf");
+        //println!("cargo:rustc-link-lib=static=ggml_sycl");
+        //println!("cargo:rustc-link-arg=")
+    }
+    if flag_sycl_fp16 {
+        cmake.configure_arg("-DLLAMA_SYCL_F16=ON");
+    }
+    if flag_sycl_nvidia {
+        cmake.configure_arg("-DLLAMA_SYCL_TARGET=NVIDIA");
+    }
+    cmake.very_verbose(true);
+
     let dst = cmake.build();
     println!("cargo:rustc-link-search=native={}/lib", dst.display());
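With the `sycl` feature enabled, build.rs drives the llama.cpp CMake build with Intel's icx/icpx compilers, resolves the oneMKL libraries from $MKLROOT/lib, and relies on the environment exported by setvars.sh to locate the compiler runtime libraries (svml, intlc, imf, iomp5, sycl). A minimal sketch of exercising the new features by hand, assuming a default oneAPI installation under /opt/intel/oneapi (the feature combinations are illustrative):

    # exports MKLROOT and CMPLR_ROOT, which build.rs asserts on
    source /opt/intel/oneapi/setvars.sh
    cd rllm/llama-cpp-low
    cargo build --release --features sycl            # FP32 SYCL kernels
    cargo build --release --features sycl,sycl_fp16  # also enable FP16 kernels
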
//println!("cargo:rustc-link-lib=static=ggml_sycl"); + //println!("cargo:rustc-link-arg=") + } + if flag_sycl_fp16 { + cmake.configure_arg("-DLLAMA_SYCL_F16=ON"); + } + if flag_sycl_nvidia { + cmake.configure_arg("-DLLAMA_SYCL_TARGET=NVIDIA"); + } + cmake.very_verbose(true); + let dst = cmake.build(); println!("cargo:rustc-link-search=native={}/lib", dst.display()); diff --git a/rllm/rllm-cuda/server.sh b/rllm/rllm-cuda/server.sh index 3c6ca131..a8dd7b14 100755 --- a/rllm/rllm-cuda/server.sh +++ b/rllm/rllm-cuda/server.sh @@ -41,6 +41,42 @@ while [ "$1" != "" ] ; do exit 1 fi ;; + --sycl ) + if [ "$CPP" = 1 ] ; then + VER="$VER --features sycl" + ADD_ARGS="--gpu-layers 1000" + else + echo "--sycl only valid for llama.cpp" + exit 1 + fi + ;; + --sycl-fp16 ) + if [ "$CPP" = 1 ] ; then + VER="$VER --features sycl,sycl_fp16" + ADD_ARGS="--gpu-layers 1000" + else + echo "--sycl-fp16 only valid for llama.cpp" + exit 1 + fi + ;; + --sycl-nvidia ) + if [ "$CPP" = 1 ] ; then + VER="$VER --features sycl,sycl_nvidia" + ADD_ARGS="--gpu-layers 1000" + else + echo "--sycl-nvidia only valid for llama.cpp" + exit 1 + fi + ;; + --sycl-nvidia-fp16 ) + if [ "$CPP" = 1 ] ; then + VER="$VER --features sycl,sycl_nvidia,sycl_fp16" + ADD_ARGS="--gpu-layers 1000" + else + echo "--sycl-nvidia-fp16 only valid for llama.cpp" + exit 1 + fi + ;; --trace ) R_LOG=info,tokenizers=error,rllm=trace,aicirt=info,llama_cpp_low=trace ;; @@ -84,7 +120,7 @@ if [ "$CPP" = 1 ] ; then * ) SELF="server.sh" cat <