set -ex
# 1. check num GPUs
# Query torch for the number of visible CUDA devices; the container must have
# at least one GPU attached or serving cannot start.
GPU_COUNT=$(python3 -c "import torch; print(torch.cuda.device_count())")
echo "Starting serving model name: ${MODEL_NAME}, num gpus: ${GPU_COUNT}"
if [ "${GPU_COUNT}" -lt 1 ]; then
  # Diagnostics go to stderr; exit non-zero so the orchestrator restarts/fails the pod.
  echo "No GPUs found. Please check if the container has acquired any GPU device" >&2
  exit 1
fi
# 2. check model path
MODEL_DIR="/mnt/models/${MODEL_NAME}"
# a. using git lfs storage initializer, model will be in /mnt/models/<model_name>
# b. using hf storage initializer, model will be in /mnt/models
if [ ! -d "${MODEL_DIR}" ]; then
  MODEL_DIR="/mnt/models"
  # NOTE: MODEL_DIR was just reassigned, so "${MODEL_DIR}/${MODEL_NAME}" below
  # still prints the original (missing) per-model path.
  echo "[WARNING] Model directory ${MODEL_DIR}/${MODEL_NAME} not found, using ${MODEL_DIR} instead"
fi
# 3. check if using gguf models
c=`find "${MODEL_DIR}" -maxdepth 1 -type f -name '*.gguf' | wc -l`
echo "find ${c} gguf files"
if [ "${c}" -gt 1 ]; then
echo "[ERROR] More than one gguf file found in ${MODEL_DIR}"
echo "Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use gguf-split tool to merge them to a single-file model."
exit 1
elif [ "${c}" -eq 1 ]; then
n=`find "${MODEL_DIR}" -maxdepth 1 -type f -name '*.gguf' -print`
echo "[INFO] Using GGUF model file: ${n}"
MODEL_PATH="${n}"
else
echo "[INFO] Using standard model directory"
MODEL_PATH="${MODEL_DIR}"
fi
# 4. launch vllm server
# Translate boolean env vars into vllm CLI flags.
# Chunked prefill defaults to ON; only the literal string "False" disables it.
case "${ENABLE_CHUNKED_PREFILL}" in
  False) PARAM_ENABLE_CHUNKED_PREFILL="--no-enable-chunked-prefill" ;;
  *)     PARAM_ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill" ;;
esac
# Eager mode defaults to OFF; only the literal string "True" enables it.
case "${ENFORCE_EAGER}" in
  True) PARAM_ENFORCE_EAGER="--enforce-eager" ;;
  *)    PARAM_ENFORCE_EAGER="" ;;
esac
# Launch the server. The original was missing the line continuation, so
# "--served-model-name" had no value and the next line ran as a separate
# command. The PARAM_* flags computed above are now actually passed; they are
# deliberately unquoted so an empty value expands to no argument.
vllm serve "${MODEL_PATH}" --task score --port 8080 \
  --served-model-name {{.Name}} {{.Namespace}}/{{.Name}} \
  ${PARAM_ENABLE_CHUNKED_PREFILL} ${PARAM_ENFORCE_EAGER}