# Optimum neuron LLM inference cache builder #183

	# Workflow display name.
	name: Optimum neuron LLM inference cache builder

	on:
	  # Manual trigger
	  workflow_dispatch:

	  # Tag pushes only: tags shaped like v<digits>.<digits>.<digits>
	  push:
	    tags:
	      - 'v[0-9]+.[0-9]+.[0-9]+'

	  # Pull requests targeting main that touch the cache_llm workflow file
	  # or the cache-fill script
	  pull_request:
	    branches: [main]
	    paths:
	      - '.github/workflows/cache_llm.yml'
	      - 'tools/cache/auto_fill_llm_cache.py'

	  # Daily scheduled run (minute 0, hour 0, every day)
	  schedule:
	    - cron: '0 0 * * *'

	# Runs are grouped by workflow name plus the first non-empty value among the
	# PR head ref, the ref name and the run id; a newer run in the same group
	# cancels the one still in progress.
	# NOTE(review): github.ref_name appears to be populated for every trigger
	# used by this workflow, so the github.run_id fallback looks unreachable and
	# scheduled/manual runs on the same ref would share a group. Confirm this is
	# intended.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name || github.run_id }}
	  cancel-in-progress: true

	jobs:
	  # Gate job: runs the repository-local sanity-check action; the cache job
	  # below declares `needs: sanity`, so it only starts after this one.
	  sanity:
	    name: Sanity
	    runs-on: ubuntu-22.04
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	      - name: Run sanity check
	        uses: ./.github/actions/sanity-check
	        with:
	          hf_token: ${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CACHE }}

	  # One job per matrix entry; each invokes tools/cache/auto_fill_llm_cache.py
	  # with the JSON config file named after that entry.
	  cache:
	    name: Create optimum-neuron LLM inference cache
	    needs: sanity
	    runs-on:
	      group: aws-inf2-8xlarge
	    strategy:
	      # Let the remaining configs keep running when one of them fails.
	      fail-fast: false
	      matrix:
	        config:
	          - trn1/llama3
	          - trn1/llama4
	          - trn1/mixtral
	          - trn1/qwen3
	          - trn1/qwen3-moe
	          - trn1/granite
	          - trn1/phi4
	          - trn1/smollm3
	          - trn2/llama3
	          - trn2/llama4
	          - trn2/qwen3-moe
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	      - name: Install Neuronx runtime
	        uses: ./.github/actions/install_neuronx_runtime
	      - name: Setup virtual environment
	        uses: ./.github/actions/setup_venv
	      - name: Create cache for ${{ matrix.config }} models
	        env:
	          # Supplied through the step environment instead of being
	          # interpolated into the script text, so the token value is never
	          # parsed by the shell.
	          HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CACHE }}
	        run: |
	          source aws_neuron_venv_pytorch/bin/activate
	          config_prefix_url=https://huggingface.co/aws-neuron/optimum-neuron-cache/raw/main/inference-cache-config
	          python tools/cache/auto_fill_llm_cache.py --config_file "${config_prefix_url}/${{ matrix.config }}.json"

# Provide feedback