# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark
on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']

  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']

  schedule:
    - cron: '04 2 * * *'
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true
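# The concurrency group combines workflow name, ref, head ref (or run id) and
# the manually supplied sha, so repeated pushes to the same branch or PR fall
# into one group and `cancel-in-progress: true` cancels the superseded run.
# A minimal sketch of the same pattern, with a hypothetical workflow name:
#
#   concurrency:
#     group: my-workflow-${{ github.ref }}
#     cancel-in-progress: true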
jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3

    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: no way found yet to avoid duplicating the runs-on value
      N_USERS: 8
      DURATION: 10m
    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"
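    # The matrix above expands to three jobs (phi-2 crossed with q4_0, q8_0,
    # f16); `include` only adds the extra `pr_comment_enabled` key to the
    # already-existing phi-2/q4_0 combination rather than creating a fourth
    # job. The effective combinations, written out by hand:
    #
    #   { model: phi-2, ftype: q4_0, pr_comment_enabled: "true" }
    #   { model: phi-2, ftype: q8_0 }
    #   { model: phi-2, ftype: f16 }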
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
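    # Net effect of the condition above: the job runs for manual dispatches on
    # the T4 series, for the nightly schedule and for pushes to master on the
    # ggerganov-owned repository, and for every pull_request_target event that
    # touches the watched paths.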
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
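      # The `ref` expression resolves, left to right: a manually supplied sha
      # input, the PR head commit (pull_request_target checks out the base
      # branch by default, so the head sha must be requested explicitly), then
      # github.sha; the last two fallbacks are effectively dead because
      # github.sha is always defined.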
      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
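      # Prometheus is launched in the background, and the `nc -z` loop blocks
      # until port 9090 (its default listen port) accepts connections. A rough
      # equivalent using Prometheus' readiness endpoint, as a sketch:
      #
      #   until curl -sf http://localhost:9090/-/ready; do sleep 0.1; done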
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse
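      # xk6 compiles a custom k6 binary with the SSE extension baked in and
      # drops it as ./k6 in the current directory (examples/server/bench),
      # which is presumably what bench.py invokes for the scenario. Standalone
      # usage of such a binary would look like:
      #
      #   ./k6 run script.js --duration 10m --vus 8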
      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DGGML_CUDA=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j $(nproc) --target llama-server
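      # GGML_CUDA replaces the retired LLAMA_CUBLAS option and matches the
      # GGML_NATIVE naming used above; GGML_NATIVE=OFF avoids -march=native so
      # numbers stay comparable across runner instances, and
      # CMAKE_CUDA_ARCHITECTURES=75 targets the T4's compute capability 7.5.
      # Only the llama-server target is built to keep compile time down.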
      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch $HEAD_REF \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV
          # Remove the dataset as we do not want it in the artifact
          rm ShareGPT_V3_unfiltered_cleaned_split.json
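      # bench.py writes its metrics to results.github.env as KEY=value lines;
      # appending that file to $GITHUB_ENV is what populates variables such as
      # BENCH_ITERATIONS, BENCH_RESULTS and HTTP_REQ_DURATION_AVG consumed by
      # the commit-status and PR-comment steps below.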
      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{ secrets.GITHUB_TOKEN }}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'
      - name: Upload benchmark images
        uses: devicons/[email protected]
        continue-on-error: true # Important: the imgur upload looks unstable (intermittent 503s)
        id: imgur_step
        with:
          client_id: ${{ secrets.IMGUR_CLIENT_ID }}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
          cd examples/server/bench

          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 📈

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>

            </details>

            </details>